├── Chapter-08 ├── mlflow-util.py ├── automated-testing.py └── scheduling-workflow-for-model-retraining.py ├── LICENSE ├── Chapter-09 ├── config │ └── setup.py ├── data │ └── datagen.py ├── util │ ├── training.py │ └── monitoring.py └── model-drift.py ├── Chapter-07 ├── custom-python-libraries.py ├── custom-model.py ├── real-time.py ├── batch-and-streaming.py └── real-time-additional.py ├── Chapter-04 ├── mlflow-without-featurestore.py └── mlflow-with-featurestore.py ├── README.md ├── Chapter-03 └── churn-analysis.py └── Chapter-06 └── model-registry-and-webhooks.py /Chapter-08/mlflow-util.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | import mlflow 3 | from mlflow.utils.rest_utils import http_request 4 | import json 5 | 6 | def client(): 7 | return mlflow.tracking.client.MlflowClient() 8 | 9 | host_creds = client()._tracking_client.store.get_host_creds() 10 | host = host_creds.host 11 | token = host_creds.token 12 | 13 | def mlflow_endpoint(endpoint, method, body='{}'): 14 | if method == 'GET': 15 | response = http_request( 16 | host_creds=host_creds, endpoint="/api/2.0/mlflow/{}".format(endpoint), method=method, params=json.loads(body)) 17 | else: 18 | response = http_request( 19 | host_creds=host_creds, endpoint="/api/2.0/mlflow/{}".format(endpoint), method=method, json=json.loads(body)) 20 | return response.json() 21 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2021 Packt 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /Chapter-09/config/setup.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | # MAGIC %md 3 | # MAGIC #### Model Drift monitoring on Databricks 4 | # MAGIC 5 | # MAGIC **Requirements** 6 | # MAGIC * The following notebook was developed and tested using [DBR 13.3 LTS ML](https://docs.databricks.com/en/release-notes/runtime/13.3lts-ml.html) 7 | # MAGIC 8 | # MAGIC **Authors** 9 | # MAGIC - Debu Sinha | debusinha2009@gmail.com / debu.sinha@databricks.com 10 | 11 | # COMMAND ---------- 12 | 13 | # MAGIC %md 14 | # MAGIC #1) Setup 15 | 16 | # COMMAND ---------- 17 | 18 | #import mlflow if exists else install notebook scoped libraries 19 | try: 20 | import mlflow 21 | except Exception as e: 22 | %pip install mlflow 23 | 24 | # COMMAND ---------- 25 | 26 | # Get Databricks workspace username 27 | username = dbutils.notebook.entry_point.getDbutils().notebook().getContext().tags().apply("user") 28 | print(username) 29 | 30 | # COMMAND ---------- 31 | 32 | # MAGIC %md 33 | # MAGIC ## 1.1) Setup Directory structure to store this demo related artifacts 34 | 35 | # COMMAND ---------- 36 | 37 | # Set home directory for our project 38 | project_home_dir = f"/Users/{username}/model_drift/" 39 | 40 | #set location for temporary files created in this module 41 | project_local_tmp_dir = f"/dbfs{project_home_dir}tmp/" 42 | 43 | #this is where we will store raw data in csv format 44 | raw_good_data_path= f"{project_home_dir}data/raw/good" 45 | 46 | #this is location where data for showcasing scenario 1 for feature drift and bug in the the upstream data processing 47 | raw_month2_bad_data_path = f"{project_home_dir}data/raw/bad" 48 | 49 | #this is location for delta table where we will store the gold dataset 50 | months_gold_path = f"{project_home_dir}delta/gold" 51 | 52 | dbutils.fs.rm(project_home_dir, True) 53 | dbutils.fs.rm(project_local_tmp_dir, True) 54 | 55 | #reset folders for data storage 56 | for path in [raw_good_data_path, raw_month2_bad_data_path, months_gold_path]: 57 | print(f"creating {path}") 58 | dbutils.fs.mkdirs(path) 59 | 60 | # COMMAND ---------- 61 | 62 | # MAGIC %fs 63 | # MAGIC ls /Users/debu.sinha@databricks.com/model_drift/data/ 64 | 65 | # COMMAND ---------- 66 | 67 | # MAGIC %md 68 | # MAGIC ## 1.2) MLflow experiment setup 69 | 70 | # COMMAND ---------- 71 | 72 | mlflow_experiment_name = "sales_prediction" 73 | 74 | #this has to be an absolute path in the databricks workspace. 
75 | mlflow_experiment_path = f"/Users/{username}/{mlflow_experiment_name}" 76 | 77 | # COMMAND ---------- 78 | 79 | import mlflow 80 | 81 | # We need to get the exact path of experiment 82 | experiment = mlflow.get_experiment_by_name(mlflow_experiment_path) 83 | 84 | if experiment: 85 | experiment_id = experiment.experiment_id 86 | mlflow.delete_experiment(experiment_id) 87 | print(f"Experiment {mlflow_experiment_name} deleted successfully.") 88 | 89 | # Create a new experiment with the specified name 90 | experiment_id = mlflow.create_experiment(mlflow_experiment_path) 91 | print(f"Experiment {mlflow_experiment_path} created successfully with ID {experiment_id}.") 92 | 93 | #set the experment for this module 94 | mlflow.set_experiment(mlflow_experiment_path) 95 | -------------------------------------------------------------------------------- /Chapter-07/custom-python-libraries.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | # MAGIC %md 3 | # MAGIC ## Author 4 | # MAGIC 5 | # MAGIC - **Debu Sinha** 6 | # MAGIC 7 | # MAGIC ## Tested Environment 8 | # MAGIC 9 | # MAGIC - **Databricks Runtime**: This notebook is tested on Databricks Runtime for Machine Learning 13.3 LTS or above. 10 | # MAGIC - **Cluster Configuration**: Single node cluster with at least 32GB RAM and 4 VCPU. 11 | # MAGIC - **Note**: The same cluster set up in Chapters 3 and 4 will be used here. 12 | # MAGIC 13 | # MAGIC ## Cluster Setup Instructions 14 | # MAGIC 15 | # MAGIC 1. **Create a Cluster**: 16 | # MAGIC - Navigate to the `Compute` icon on the left sidebar and click on `Create Cluster`. 17 | # MAGIC - Under `Policy`, select `Unrestricted`. 18 | # MAGIC - Enter a name for your cluster, for example, `demo`, into the cluster name text box. 19 | # MAGIC - In `Cluster Mode`, select `Single Node`. 20 | # MAGIC - Choose `Databricks Runtime Version` 13.3 LTS (Scala 2.12, Spark 3.4.1) from the `ML` tab. 21 | # MAGIC - On `AWS`, select `i3.xlarge` / on `Azure`, select `Standard_DS4_V2` as __Node type__. 22 | # MAGIC - Click on `Create Cluster` and wait for your cluster to be provisioned. 23 | # MAGIC 24 | # MAGIC 2. **Attach this Notebook to Your Cluster**: 25 | # MAGIC - Click on the menu labeled `Detached` at the top left of this workbook. 26 | # MAGIC - Select your cluster name to attach this notebook to your cluster. 
27 | 28 | # COMMAND ---------- 29 | 30 | # Import necessary libraries and modules 31 | import mlflow 32 | import mlflow.sklearn 33 | from sklearn.datasets import load_iris 34 | from sklearn.ensemble import RandomForestClassifier 35 | from pandas import DataFrame 36 | from mlflow.models import infer_signature 37 | from mlflow.models.utils import add_libraries_to_model 38 | 39 | # Initialize the MLflow run 40 | with mlflow.start_run() as run: 41 | # Load the Iris dataset 42 | iris_data = load_iris() 43 | training_data = DataFrame(data=iris_data.data, columns=iris_data.feature_names) 44 | 45 | # Initialize and train the RandomForest Classifier 46 | random_forest_model = RandomForestClassifier(max_depth=7, random_state=42) 47 | random_forest_model.fit(training_data, iris_data.target) 48 | 49 | # Infer model signature for later use 50 | model_signature = infer_signature(training_data, random_forest_model.predict(training_data)) 51 | 52 | # Log the trained model to MLflow 53 | mlflow.sklearn.log_model(random_forest_model, "iris_classifier", 54 | signature=model_signature, 55 | registered_model_name="enhanced_model_with_libraries") 56 | 57 | # Model URI for accessing the registered model 58 | access_model_uri = "models:/enhanced_model_with_libraries/1" 59 | 60 | # Add libraries to the original model run 61 | add_libraries_to_model(access_model_uri) 62 | 63 | # Example to add libraries to an existing run 64 | # prev_run_id = "some_existing_run_id" 65 | # add_libraries_to_model(access_model_uri, run_id=prev_run_id) 66 | 67 | 68 | # Example to add libraries to a new run 69 | with mlflow.start_run(): 70 | add_libraries_to_model(access_model_uri) 71 | 72 | # Example to add libraries and register under a new model name 73 | with mlflow.start_run(): 74 | add_libraries_to_model(access_model_uri, registered_model_name="new_enhanced_model") 75 | -------------------------------------------------------------------------------- /Chapter-08/automated-testing.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | # MAGIC %md 3 | # MAGIC Load the model name. The **`event_message`** is automatically populated by the webhook. 4 | 5 | # COMMAND ---------- 6 | 7 | 8 | import json 9 | 10 | event_message = dbutils.widgets.get("event_message") 11 | event_message_dict = json.loads(event_message) 12 | model_name = event_message_dict.get("model_name") 13 | 14 | print(event_message_dict) 15 | print(model_name) 16 | 17 | # COMMAND ---------- 18 | 19 | # MAGIC %md Use the model name to get the latest model version. 20 | 21 | # COMMAND ---------- 22 | 23 | # MAGIC %run ./mlflow-util 24 | 25 | # COMMAND ---------- 26 | 27 | from mlflow.tracking import MlflowClient 28 | client = MlflowClient() 29 | 30 | version = event_message_dict.get("version") 31 | version 32 | 33 | # COMMAND ---------- 34 | 35 | # MAGIC %md Use the model name and version to load a **`pyfunc`** model of our model in staging environment. 36 | 37 | # COMMAND ---------- 38 | 39 | import mlflow 40 | 41 | pyfunc_model = mlflow.pyfunc.load_model(model_uri=f"models:/{model_name}/{version}") 42 | 43 | # COMMAND ---------- 44 | 45 | # MAGIC %md Get the input schema of our logged model. 46 | 47 | # COMMAND ---------- 48 | 49 | input_schema = pyfunc_model.metadata.get_input_schema().as_spark_schema() 50 | 51 | # COMMAND ---------- 52 | 53 | # MAGIC %md Here we define our expected input schema. 
54 | 55 | # COMMAND ---------- 56 | 57 | from pyspark.sql.types import StringType, StructField, IntegerType, DoubleType, StructType 58 | 59 | expected_input_schema = (StructType([ 60 | StructField("CreditScore", IntegerType(), True), 61 | StructField("Geography", StringType(), True), 62 | StructField("Gender", StringType(), True), 63 | StructField("Age", IntegerType(), True), 64 | StructField("Tenure", IntegerType(), True), 65 | StructField("Balance", DoubleType(), True), 66 | StructField("NumOfProducts", IntegerType(), True), 67 | StructField("HasCrCard", IntegerType(), True), 68 | StructField("isActiveMember", IntegerType(), True), 69 | StructField("EstimatedSalary", DoubleType(), True) 70 | ])) 71 | 72 | # COMMAND ---------- 73 | 74 | if sorted(expected_input_schema.fields, key=lambda x: x.name) != sorted(input_schema.fields, key=lambda x: x.name): 75 | comment = "This model failed the input schema check" 76 | comment_body = {'name': model_name, 'version': version, 'comment': comment} 77 | mlflow_endpoint('comments/create', 'POST', json.dumps(comment_body)) 78 | raise Exception("Input schema mismatch") 79 | 80 | # COMMAND ---------- 81 | 82 | # MAGIC %md Load the dataset and generate some predictions to ensure our model is working correctly. 83 | 84 | # COMMAND ---------- 85 | 86 | import pandas as pd 87 | 88 | # read the raw dataset provided with the code base 89 | sample_data = spark.table("bank_churn_analysis.raw_data") 90 | df = sample_data.toPandas() 91 | 92 | # exclude the columns that are not used for prediction 93 | excluded_columns = {"RowNumber", "CustomerId", "Surname"} 94 | df_input = df[[col for col in df.columns if col not in excluded_columns]] 95 | 96 | df_input.head() 97 | 98 | # COMMAND ---------- 99 | 100 | predictions = pyfunc_model.predict(df_input) 101 | 102 | # COMMAND ---------- 103 | 104 | # MAGIC %md Make sure our prediction types are correct. 105 | 106 | # COMMAND ---------- 107 | 108 | import numpy as np 109 | 110 | if type(predictions) != np.ndarray or type(predictions[0]) != np.int32: 111 | comment = "This model failed the prediction type check" 112 | comment_body = {'name': model_name, 'version': version, 'comment': comment} 113 | mlflow_endpoint('comments/create', 'POST', json.dumps(comment_body)) 114 | raise Exception("Prediction datatype is not as expected") 115 | 116 | # COMMAND ---------- 117 | 118 | # Leave a comment for the ML engineer who will be reviewing the tests 119 | comment = "This model passed all the tests" 120 | comment_body = {'name': model_name, 'version': version, 'comment': comment} 121 | mlflow_endpoint('comments/create', 'POST', json.dumps(comment_body)) 122 | -------------------------------------------------------------------------------- /Chapter-08/scheduling-workflow-for-model-retraining.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | # MAGIC %md 3 | # MAGIC ## Training Workflow 4 | # MAGIC 5 | # MAGIC In this notebook, we'll create a workflow to retrain our model. Then, we'll set up this notebook to run monthly using a Databricks Job to ensure our model is always up-to-date. 6 | # MAGIC 7 | # MAGIC ### Load Features 8 | # MAGIC 9 | # MAGIC First, we'll load our feature table, which in this case is the original raw dataset. 10 | # MAGIC 11 | # MAGIC 12 | # MAGIC In the case of this demonstration, these are the same records, but in a real-world scenario we'd likely have updated records appended to this table each time the model is trained.
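For reference only: the monthly schedule mentioned above is typically configured through the Databricks Jobs UI. The snippet below is a minimal, hedged sketch of how the same monthly trigger could be created programmatically with the Databricks Jobs API (2.1); the job name, notebook path, cluster ID, and cron expression are illustrative placeholders rather than values from the book.

```python
# Hedged sketch: create a monthly Databricks Job for this notebook via the Jobs API (2.1).
# All names, paths, and IDs below are placeholders for illustration.
import requests

# Reuse the same notebook-context calls this repository uses to obtain a token and workspace host.
token = dbutils.notebook.entry_point.getDbutils().notebook().getContext().apiToken().getOrElse(None)
java_tags = dbutils.notebook.entry_point.getDbutils().notebook().getContext().tags()
tags = sc._jvm.scala.collection.JavaConversions.mapAsJavaMap(java_tags)
instance = tags["browserHostName"]

job_payload = {
    "name": "monthly-churn-model-retraining",  # placeholder job name
    "tasks": [
        {
            "task_key": "retrain",
            "notebook_task": {
                # placeholder path; point this at the retraining notebook in your workspace
                "notebook_path": "/Users/<your-user>/scheduling-workflow-for-model-retraining"
            },
            "existing_cluster_id": "<your-cluster-id>",  # placeholder cluster ID
        }
    ],
    # Quartz cron: 06:00 UTC on the first day of every month
    "schedule": {
        "quartz_cron_expression": "0 0 6 1 * ?",
        "timezone_id": "UTC",
        "pause_status": "UNPAUSED",
    },
}

response = requests.post(
    f"https://{instance}/api/2.1/jobs/create",
    headers={"Authorization": f"Bearer {token}"},
    json=job_payload,
)
response.raise_for_status()
print(response.json())  # returns the job_id of the newly created job
```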
13 | 14 | # COMMAND ---------- 15 | 16 | # MAGIC %pip install databricks-registry-webhooks 17 | 18 | # COMMAND ---------- 19 | 20 | database_name = "bank_churn_analysis" 21 | 22 | #we will exclude the same columns that we did earlier while training our model using AutoML from UI. 23 | excluded_featured_from_raw = {"RowNumber", "CustomerId", "Surname"} 24 | target_column = "Exited" 25 | 26 | new_data = spark.table(f"{database_name}.raw_data") 27 | features = [c for c in new_data.columns if c not in excluded_featured_from_raw] 28 | 29 | # COMMAND ---------- 30 | 31 | # MAGIC %md 32 | # MAGIC ## Add webhook for kicking off automated testing job 33 | 34 | # COMMAND ---------- 35 | 36 | # get token from notebook 37 | token = dbutils.notebook.entry_point.getDbutils().notebook().getContext().apiToken().getOrElse(None) 38 | 39 | #create authorization header for REST calls 40 | headers = {"Authorization": f"Bearer {token}"} 41 | 42 | # Next we need an enpoint at which to execute our request which we can get from the Notebook's tags collection 43 | java_tags = dbutils.notebook.entry_point.getDbutils().notebook().getContext().tags() 44 | 45 | # This object comes from the Java CM 46 | tags = sc._jvm.scala.collection.JavaConversions.mapAsJavaMap(java_tags) 47 | 48 | # extract the databricks instance (domain name) from the dictionary 49 | instance = tags["browserHostName"] 50 | 51 | # COMMAND ---------- 52 | 53 | model_name = "Churn Prediction Bank" 54 | 55 | # COMMAND ---------- 56 | 57 | from databricks_registry_webhooks import RegistryWebhooksClient, JobSpec 58 | 59 | job_spec = JobSpec( 60 | job_id="295266394513960", 61 | workspace_url="https://"+instance, 62 | access_token=token 63 | ) 64 | 65 | job_webhook = RegistryWebhooksClient().create_webhook( 66 | model_name=model_name, 67 | events=["TRANSITION_REQUEST_TO_STAGING_CREATED"], 68 | job_spec=job_spec, 69 | description="Registering webhook to automate testing of a new candidate model for staging" 70 | ) 71 | 72 | job_webhook 73 | 74 | # COMMAND ---------- 75 | 76 | # Test the Job webhook 77 | # RegistryWebhooksClient().test_webhook(id=job_webhook.id) 78 | 79 | # COMMAND ---------- 80 | 81 | # MAGIC %md 82 | # MAGIC ### AutoML Process 83 | # MAGIC 84 | # MAGIC Next, we'll use the AutoML API to kick off an AutoML classification experiment. This is similar to what we did with the AutoML UI, but we can use the API to automate this process. 85 | 86 | # COMMAND ---------- 87 | 88 | import databricks.automl 89 | model = databricks.automl.classify( 90 | new_data.select(features), 91 | target_col=target_column, 92 | primary_metric="f1", 93 | timeout_minutes=5, 94 | max_trials=30, 95 | ) 96 | 97 | # COMMAND ---------- 98 | 99 | #information about the latest automl model training 100 | help(model) 101 | 102 | # COMMAND ---------- 103 | 104 | # MAGIC %md 105 | # MAGIC ### Register the Best Model 106 | # MAGIC 107 | # MAGIC Once the AutoML experiment is done, we can identify the best model from the experiment and register that model to the Model Registry. 
108 | 109 | # COMMAND ---------- 110 | 111 | import mlflow 112 | from mlflow.tracking.client import MlflowClient 113 | 114 | client = MlflowClient() 115 | 116 | run_id = model.best_trial.mlflow_run_id 117 | 118 | model_uri = f"runs:/{run_id}/model" 119 | 120 | model_details = mlflow.register_model(model_uri, model_name) 121 | 122 | # COMMAND ---------- 123 | 124 | # MAGIC %md 125 | # MAGIC ### Request model Transition to Staging 126 | # MAGIC 127 | # MAGIC Once the model is registered, we request that it be transitioned to the **Staging** stage for testing. 128 | # MAGIC 129 | # MAGIC First, we'll includ a helper function to interact with the MLflow registry API. In your production environment its always a good practice to modularize your code for maintainability. 130 | 131 | # COMMAND ---------- 132 | 133 | # MAGIC %run ./mlflow-util 134 | 135 | # COMMAND ---------- 136 | 137 | # MAGIC %md 138 | # MAGIC Next, we'll set up the transition request using the `mlflow_endpoint` operation from the helpers notebook. 139 | 140 | # COMMAND ---------- 141 | 142 | staging_request = {'name': model_name, 'version': model_details.version, 'stage': 'Staging', 'archive_existing_versions': 'false'} 143 | mlflow_endpoint('transition-requests/create', 'POST', json.dumps(staging_request)) 144 | 145 | # COMMAND ---------- 146 | 147 | # MAGIC %md 148 | # MAGIC And we'll add a comment to the version of the model that we just requested be moved to **Staging** to let the machine learning engineer know why we are making the request. 149 | 150 | # COMMAND ---------- 151 | 152 | # Leave a comment for the ML engineer who will be reviewing the tests 153 | comment = "This was the best model from the most recent AutoML run. Ready for testing" 154 | comment_body = {'name': model_name, 'version': model_details.version, 'comment': comment} 155 | mlflow_endpoint('comments/create', 'POST', json.dumps(comment_body)) 156 | 157 | # COMMAND ---------- 158 | 159 | 160 | -------------------------------------------------------------------------------- /Chapter-07/custom-model.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | # MAGIC %md 3 | # MAGIC ## Author 4 | # MAGIC 5 | # MAGIC - **Debu Sinha** 6 | # MAGIC 7 | # MAGIC ## Tested Environment 8 | # MAGIC 9 | # MAGIC - **Databricks Runtime**: This notebook is tested on Databricks Runtime for Machine Learning 13.3 LTS or above. 10 | # MAGIC - **Cluster Configuration**: Single node cluster with at least 32GB RAM and 4 VCPU. 11 | # MAGIC - **Note**: The same cluster set up in Chapters 3 and 4 will be used here. 12 | # MAGIC 13 | # MAGIC ## Cluster Setup Instructions 14 | # MAGIC 15 | # MAGIC 1. **Create a Cluster**: 16 | # MAGIC - Navigate to the `Compute` icon on the left sidebar and click on `Create Cluster`. 17 | # MAGIC - Under `Policy`, select `Unrestricted`. 18 | # MAGIC - Enter a name for your cluster, for example, `demo`, into the cluster name text box. 19 | # MAGIC - In `Cluster Mode`, select `Single Node`. 20 | # MAGIC - Choose `Databricks Runtime Version` 13.3 LTS (Scala 2.12, Spark 3.4.1) from the `ML` tab. 21 | # MAGIC - On `AWS`, select `i3.xlarge` / on `Azure`, select `Standard_DS4_V2` as __Node type__. 22 | # MAGIC - Click on `Create Cluster` and wait for your cluster to be provisioned. 23 | # MAGIC 24 | # MAGIC 2. **Attach this Notebook to Your Cluster**: 25 | # MAGIC - Click on the menu labeled `Detached` at the top left of this workbook. 
26 | # MAGIC - Select your cluster name to attach this notebook to your cluster. 27 | # MAGIC 28 | # MAGIC ## Predicting Wine Cultivars using Decision Tree Classifier and MLflow 29 | # MAGIC 30 | # MAGIC This code is designed to solve a multi-class classification problem using the wine dataset. The wine dataset contains 178 samples, each belonging to one of three different cultivars (types of grape) in Italy. Each sample has 13 different features like Alcohol, Malic acid, etc. 31 | # MAGIC 32 | # MAGIC ### Objective 33 | # MAGIC 34 | # MAGIC The objective of the model is to predict the cultivar to which a given wine sample belongs based on its 13 features. In simpler terms, for a new wine sample, the model aims to categorize it as 'class_0', 'class_1', or 'class_2', representing one of the three possible cultivars. Additionally, the model provides the probabilities for the sample belonging to each of these classes. 35 | # MAGIC 36 | # MAGIC ### Implementation 37 | # MAGIC 38 | # MAGIC The code uses a Decision Tree classifier and trains it on a subset of the wine dataset, known as the training set. After training, the model is encapsulated in a custom Python class (`CustomModelWrapper`). This class facilitates the logging of the model using MLflow, a platform for end-to-end machine learning lifecycle management. 39 | # MAGIC 40 | # MAGIC Once the model is logged, it can be deployed and used to make predictions on new, unseen data, commonly referred to as the test set. 41 | 42 | # COMMAND ---------- 43 | 44 | from sklearn.datasets import load_wine 45 | from sklearn.model_selection import train_test_split 46 | from sklearn.tree import DecisionTreeClassifier 47 | from mlflow.models.signature import ModelSignature 48 | from mlflow.types.schema import Schema, ColSpec 49 | import mlflow 50 | import mlflow.pyfunc 51 | import pandas as pd 52 | 53 | # Custom model class 54 | class CustomModelWrapper(mlflow.pyfunc.PythonModel): 55 | # Initialize the classifier model in the constructor 56 | def __init__(self, classifier_model): 57 | self.classifier_model = classifier_model 58 | 59 | # Prediction method 60 | def predict(self, context, model_data): 61 | # Compute the probabilities and the classes 62 | probs = self.classifier_model.predict_proba(model_data) 63 | preds = self.classifier_model.predict(model_data) 64 | 65 | # Create a DataFrame to hold probabilities and predictions 66 | labels = ["class_0", "class_1", "class_2"] 67 | result_df = pd.DataFrame(probs, columns=[f'prob_{label}' for label in labels]) 68 | result_df['prediction'] = [labels[i] for i in preds] 69 | 70 | return result_df 71 | 72 | # Load the wine dataset and split it into training and test sets 73 | wine_data = load_wine() 74 | X, y = wine_data.data, wine_data.target 75 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=7) 76 | 77 | # Initialize and fit the DecisionTreeClassifier 78 | dt_classifier = DecisionTreeClassifier(random_state=7) 79 | dt_classifier.fit(X_train, y_train) 80 | 81 | # Create an instance of the CustomModelWrapper 82 | custom_wrapper = CustomModelWrapper(dt_classifier) 83 | 84 | # Define the input and output schema 85 | input_cols = [ColSpec("double", feature) for feature in wine_data.feature_names] 86 | output_cols = [ColSpec("double", f'prob_{cls}') for cls in wine_data.target_names] + [ColSpec("string", 'prediction')] 87 | model_sign = ModelSignature(inputs=Schema(input_cols), outputs=Schema(output_cols)) 88 | 89 | # Prepare an example input 90 | input_sample = 
pd.DataFrame(X_train[:1], columns=wine_data.feature_names) 91 | input_sample_dict = input_sample.to_dict(orient='list') 92 | 93 | # Log the model using MLflow 94 | with mlflow.start_run(): 95 | mlflow.pyfunc.log_model("wine_model", 96 | python_model=custom_wrapper, 97 | input_example=input_sample_dict, 98 | signature=model_sign) 99 | 100 | # Retrieve the run ID and load the logged model 101 | last_run_id = mlflow.last_active_run().info.run_id 102 | retrieved_model = mlflow.pyfunc.load_model(f"runs:/{last_run_id}/wine_model") 103 | 104 | # Create a DataFrame for the test data 105 | test_df = pd.DataFrame(X_test[:1], columns=wine_data.feature_names) 106 | 107 | # Use the loaded model for prediction 108 | prediction_result = retrieved_model.predict(test_df) 109 | 110 | 111 | # COMMAND ---------- 112 | 113 | prediction_result 114 | 115 | # COMMAND ---------- 116 | 117 | 118 | -------------------------------------------------------------------------------- /Chapter-04/mlflow-without-featurestore.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | # MAGIC %md # MLflow introduction. 3 | # MAGIC 4 | # MAGIC This tutorial covers an example of how to use the integrated MLflow tracking capabilities to track your model training with the integrated feature store. 5 | # MAGIC - Import data from the Delta table that contains feature engineered datasets. 6 | # MAGIC - Create a baseline model for churn prediction and store it in the integrated MLflow tracking server. 7 | 8 | # COMMAND ---------- 9 | 10 | # MAGIC %md 11 | # MAGIC ###0. SETUP -- Databricks Spark cluster: 12 | # MAGIC 13 | # MAGIC 1. **Create** a cluster by... 14 | # MAGIC - Click the `Compute` icon on the left sidebar and then `Create Cluster.` 15 | # MAGIC - In `Policy` select `Unrestricted`. 16 | # MAGIC - Enter any text, i.e `demo` into the cluster name text box. 17 | # MAGIC - Select `Single Node` in the cluster mode. 18 | # MAGIC - Select the `Databricks runtime version` value `13.3 LTS (Scala 2.12, Spark 3.4.1)` from the `ML` tab. 19 | # MAGIC - On `AWS`, select `i3.xlarge` / on `Azure`, select `Standard_DS4_V2` as __Node type__. 20 | # MAGIC - Click the `create cluster` button and wait for your cluster to be provisioned 21 | # MAGIC 3. **Attach** this notebook to your cluster by... 22 | # MAGIC - Click on your cluster name in menu `Detached` at the top left of this workbook to attach it to this workbook 23 | 24 | # COMMAND ---------- 25 | 26 | #install latest version of sklearn 27 | %pip install -U scikit-learn 28 | 29 | # COMMAND ---------- 30 | 31 | # MAGIC %md 32 | # MAGIC ### Step 1) Importing the desired libraries and defining few constants. 33 | # MAGIC 34 | # MAGIC - Note:
35 | # MAGIC - In this example, the feature table is the same one we created in Chapter 3; however, we will not use the Feature Store API to access the data in the feature table.
36 | # MAGIC - As explained in chapter 3, all the offline feature tables are backed as Delta tables and are searchable through the integrated Hive metastore in Databricks. This allows us to read these tables like a regular external or managed table. 37 | 38 | # COMMAND ---------- 39 | 40 | from databricks.feature_store import FeatureStoreClient 41 | from databricks.feature_store import FeatureLookup 42 | import typing 43 | 44 | from sklearn import metrics 45 | from sklearn.ensemble import RandomForestClassifier 46 | from sklearn.model_selection import train_test_split 47 | import mlflow 48 | import pandas as pd 49 | 50 | # COMMAND ---------- 51 | 52 | #Name of experiment where we will track all the different model training runs. 53 | EXPERIMENT_NAME = "Bank_Customer_Churn_Analysis" 54 | #Name of the model 55 | MODEL_NAME = "random_forest_classifier" 56 | #This is the name for the entry in model registry 57 | MODEL_REGISTRY_NAME = "Bank_Customer_Churn" 58 | #The email you use to authenticate in the Databricks workspace 59 | USER_EMAIL = "debu.sinha@databricks.com" 60 | #Location where the MLflow experiement will be listed in user workspace 61 | EXPERIMENT_NAME = f"/Users/{USER_EMAIL}/{EXPERIMENT_NAME}" 62 | # we have all the features backed into a Delta table so we will read directly 63 | FEATURE_TABLE = "bank_churn_analysis.bank_customer_features" 64 | 65 | # COMMAND ---------- 66 | 67 | # MAGIC %md 68 | # MAGIC ### Step 2) Build a simplistic model that uses the feature store table as its source for training and validation. 69 | 70 | # COMMAND ---------- 71 | 72 | # set experiment name 73 | mlflow.set_experiment(EXPERIMENT_NAME) 74 | 75 | with mlflow.start_run(): 76 | TEST_SIZE = 0.20 77 | 78 | # Now we will read the data directly from the feature table 79 | training_df = spark.table(FEATURE_TABLE) 80 | 81 | # convert the dataset to pandas so that we can fit sklearn RandomForestClassifier on it 82 | train_df = training_df.toPandas() 83 | 84 | # The train_df represents the input dataframe that has all the feature columns along with the new raw input in the form of training_df. 85 | X = train_df.drop(['Exited'], axis=1) 86 | y = train_df['Exited'] 87 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=TEST_SIZE, random_state=54, stratify=y) 88 | 89 | # here we will are not doing any hyperparameter tuning however, in future we will see how to perform hyperparameter tuning in scalable manner on Databricks. 90 | model = RandomForestClassifier(n_estimators=100).fit(X_train, y_train) 91 | signature = mlflow.models.signature.infer_signature(X_train, model.predict(X_train)) 92 | 93 | predictions = model.predict(X_test) 94 | fpr, tpr, _ = metrics.roc_curve(y_test, predictions, pos_label=1) 95 | auc = metrics.auc(fpr, tpr) 96 | accuracy = metrics.accuracy_score(y_test, predictions) 97 | 98 | # get the calculated feature importances. 
99 | importances = dict(zip(model.feature_names_in_, model.feature_importances_)) 100 | # log artifact 101 | mlflow.log_dict(importances, "feature_importances.json") 102 | # log metrics 103 | mlflow.log_metric("auc", auc) 104 | mlflow.log_metric("accuracy", accuracy) 105 | # log parameters 106 | mlflow.log_param("split_size", TEST_SIZE) 107 | mlflow.log_params(model.get_params()) 108 | # set tag 109 | mlflow.set_tag(MODEL_NAME, "mlflow demo") 110 | # log the model itself in mlflow tracking server 111 | mlflow.sklearn.log_model(model, MODEL_NAME, signature=signature, input_example=X_train.iloc[:4, :]) 112 | 113 | # COMMAND ---------- 114 | 115 | from mlflow.tracking import MlflowClient 116 | #initialize the mlflow client 117 | client = MlflowClient() 118 | #get the experiment id 119 | experiment_id = mlflow.get_experiment_by_name(EXPERIMENT_NAME).experiment_id 120 | #get the latest run id, which will allow us to directly access the metrics, attributes, and all the info 121 | run_id = mlflow.search_runs(experiment_id, order_by=["start_time DESC"]).head(1)["run_id"].values[0] 122 | #now we will register the latest model into the model registry 123 | new_model_version = mlflow.register_model(f"runs:/{run_id}/{MODEL_NAME}", MODEL_REGISTRY_NAME) 124 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 

[Image: Machine Learning Summit 2025 banner]

2 | 3 | ## Machine Learning Summit 2025 4 | **Bridging Theory and Practice: ML Solutions for Today’s Challenges** 5 | 6 | 3 days, 20+ experts, and 25+ tech sessions and talks covering critical aspects of: 7 | - **Agentic and Generative AI** 8 | - **Applied Machine Learning in the Real World** 9 | - **ML Engineering and Optimization** 10 | 11 | 👉 [Book your ticket now >>](https://packt.link/mlsumgh) 12 | 13 | --- 14 | 15 | ## Join Our Newsletters 📬 16 | 17 | ### DataPro 18 | *The future of AI is unfolding. Don’t fall behind.* 19 | 20 |

[Image: DataPro newsletter QR code]

21 | 22 | Stay ahead with [**DataPro**](https://landing.packtpub.com/subscribe-datapronewsletter/?link_from_packtlink=yes), the free weekly newsletter for data scientists, AI/ML researchers, and data engineers. 23 | From trending tools like **PyTorch**, **scikit-learn**, **XGBoost**, and **BentoML** to hands-on insights on **database optimization** and real-world **ML workflows**, you’ll get what matters, fast. 24 | 25 | > Stay sharp with [DataPro](https://landing.packtpub.com/subscribe-datapronewsletter/?link_from_packtlink=yes). Join **115K+ data professionals** who never miss a beat. 26 | 27 | --- 28 | 29 | ### BIPro 30 | *Business runs on data. Make sure yours tells the right story.* 31 | 32 |

[Image: BIPro newsletter QR code]

33 | 34 | [**BIPro**](https://landing.packtpub.com/subscribe-bipro-newsletter/?link_from_packtlink=yes) is your free weekly newsletter for BI professionals, analysts, and data leaders. 35 | Get practical tips on **dashboarding**, **data visualization**, and **analytics strategy** with tools like **Power BI**, **Tableau**, **Looker**, **SQL**, and **dbt**. 36 | 37 | > Get smarter with [BIPro](https://landing.packtpub.com/subscribe-bipro-newsletter/?link_from_packtlink=yes). Trusted by **35K+ BI professionals**, see what you’re missing. 38 | 39 | # Practical Machine Learning on Databricks 40 | 41 | 42 | 43 | This is the code repository for [Practical Machine Learning on Databricks](https://www.amazon.com/Practical-Data-Science-Databricks-end/dp/1801812039?utm_source=github&utm_medium=repository&utm_campaign=9781837631285), published by Packt. 44 | 45 | **Seamlessly transition ML models and MLOps on Databricks** 46 | 47 | ## What is this book about? 48 | Unleash the potential of databricks for end-to-end machine learning with this comprehensive guide, tailored for experienced data scientists and developers transitioning from DIY or other cloud platforms. Building on a strong foundation in Python, Practical Machine Learning on Databricks serves as your roadmap from development to production, covering all intermediary steps using the databricks platform. 49 | 50 | This book covers the following exciting features: 51 | * Transition smoothly from DIY setups to databricks 52 | * Master AutoML for quick ML experiment setup 53 | * Automate model retraining and deployment 54 | * Leverage databricks feature store for data prep 55 | * Use MLflow for effective experiment tracking 56 | * Gain practical insights for scalable ML solutions 57 | * Find out how to handle model drifts in production environments 58 | 59 | If you feel this book is for you, get your [copy](https://www.amazon.com/dp/1801812039) today! 60 | 61 | https://www.packtpub.com/ 63 | 64 | ## Instructions and Navigations 65 | All of the code is organized into folders. For example, Chapter02. 66 | 67 | The code will look like the following: 68 | ``` 69 | iris = load_iris() 70 | 71 | X = iris.data # Features 72 | 73 | y = iris.target # Labels 74 | 75 | ``` 76 | 77 | **Following is what you need for this book:** 78 | This book is for experienced data scientists, engineers, and developers proficient in Python, statistics, and ML lifecycle looking to transition to databricks from DIY clouds. Introductory Spark knowledge is a must to make the most out of this book, however, end-to-end ML workflows will be covered. If you aim to accelerate your machine learning workflows and deploy scalable, robust solutions, this book is an indispensable resource. 79 | 80 | With the following software and hardware list you can run all code files present in the book (Chapter 1-10). 
81 | ### Software and Hardware List 82 | | Chapter | Software required | OS required | 83 | | -------- | ------------------------------------ | ----------------------------------- | 84 | | 1-10 | Databricks Runtime | Windows and Mac OS | 85 | | 1-10 | Python proficiency (3.x) | Windows and Mac OS | 86 | | 1-10 | Statistics and ML basics | Windows and Mac OS | 87 | | 1-10 | Spark knowledge (3.0 or above) | Windows and Mac OS | 88 | | 1-10 | Delta Lake features (optional) | Windows and Mac OS | 89 | 90 | ### Related products 91 | * Machine Learning for Emotion Analysis in Python [[Packt]](https://www.packtpub.com/product/machine-learning-for-emotion-analysis-in-python/9781803240688?utm_source=github&utm_medium=repository&utm_campaign=9781803240688) [[Amazon]](https://www.amazon.com/dp/1803240687) 92 | 93 | * Machine Learning with LightGBM and Python [[Packt]](https://www.packtpub.com/product/machine-learning-with-lightgbm-and-python/9781800564749?utm_source=github&utm_medium=repository&utm_campaign=9781800564749) [[Amazon]](https://www.amazon.com/dp/1800564740) 94 | 95 | ## Get to Know the Author 96 | **Debu Sinha** 97 | is an experienced data science and engineering leader with deep expertise in software engineering and solutions architecture. With over 10 years in the industry, Debu has a proven track record in designing scalable software applications and big data, and machine learning systems. As lead ML specialist on the Specialist Solutions Architect team at Databricks, Debu focuses on AI/ML use cases in the cloud and serves as an expert on LLMs, ML, and MLOps. With prior experience as a start-up co-founder, Debu has demonstrated skills in team-building, scaling, and delivering impactful software solutions. An established thought leader, Debu has received multiple awards and regularly speaks at industry events. 98 | 99 | 100 | 101 | 102 | -------------------------------------------------------------------------------- /Chapter-07/real-time.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | # MAGIC %md # 3 | 4 | # COMMAND ---------- 5 | 6 | # MAGIC %md 7 | # MAGIC ## Author 8 | # MAGIC 9 | # MAGIC - **Debu Sinha** 10 | # MAGIC 11 | # MAGIC ## Tested Environment 12 | # MAGIC 13 | # MAGIC - **Databricks Runtime**: This notebook is tested on Databricks Runtime for Machine Learning 13.3 LTS or above. 14 | # MAGIC - **Cluster Configuration**: Single node cluster with at least 32GB RAM and 4 VCPU. 15 | # MAGIC - **Note**: The same cluster set up in Chapters 3 and 4 will be used here. 16 | # MAGIC 17 | # MAGIC ## Cluster Setup Instructions 18 | # MAGIC 19 | # MAGIC 1. **Create a Cluster**: 20 | # MAGIC - Navigate to the `Compute` icon on the left sidebar and click on `Create Cluster`. 21 | # MAGIC - Under `Policy`, select `Unrestricted`. 22 | # MAGIC - Enter a name for your cluster, for example, `demo`, into the cluster name text box. 23 | # MAGIC - In `Cluster Mode`, select `Single Node`. 24 | # MAGIC - Choose `Databricks Runtime Version` 13.3 LTS (Scala 2.12, Spark 3.4.1) from the `ML` tab. 25 | # MAGIC - On `AWS`, select `i3.xlarge` / on `Azure`, select `Standard_DS4_V2` as __Node type__. 26 | # MAGIC - Click on `Create Cluster` and wait for your cluster to be provisioned. 27 | # MAGIC 28 | # MAGIC 2. **Attach this Notebook to Your Cluster**: 29 | # MAGIC - Click on the menu labeled `Detached` at the top left of this workbook. 30 | # MAGIC - Select your cluster name to attach this notebook to your cluster. 
31 | # MAGIC 32 | # MAGIC ## Real Time Deployment Options 33 | # MAGIC 34 | # MAGIC * **Databricks Integrated Serving Endpoints**: These endpoints offer a comprehensive solution for both prototyping and production deployment of models. They are designed to manage real-time requests through REST APIs. We are going to cover this approach in the notebook. 35 | # MAGIC 36 | # MAGIC ### Additional options 37 | # MAGIC 38 | # MAGIC MLflow integrates seamlessly with managed services across various cloud platforms if your intent is to use cloud specific model serving capabilities: 39 | # MAGIC 40 | # MAGIC - **Azure ML**: For Microsoft Azure 41 | # MAGIC - **SageMaker**: For AWS 42 | # MAGIC - **Vertex AI**: For Google Cloud Platform 43 | # MAGIC 44 | # MAGIC ### Custom Deployments 45 | # MAGIC 46 | # MAGIC If you're seeking a more custom deployment, you can: 47 | # MAGIC 48 | # MAGIC - Export the model from the Model Registry as a Python pickle file. 49 | # MAGIC - Create your own Flask application to serve the model. 50 | # MAGIC 51 | # MAGIC **Note**: This custom approach often leverages containerization technologies like Docker or orchestration solutions like Kubernetes. 52 | # MAGIC 53 | # MAGIC 54 | 55 | # COMMAND ---------- 56 | 57 | # MAGIC %md 58 | # MAGIC ### Databricks Serving Endpoint 59 | # MAGIC We will use the model for our Bank Customer Churn prediction that we enabled serving for through the UI. On the serving page you can find code snippets that show you exactly how to call the deployed model. Here we are going to dynamically generate the URI for the deployed model so that you can execute this code in your workspace without change. 60 | 61 | # COMMAND ---------- 62 | 63 | # get token from notebook 64 | token = dbutils.notebook.entry_point.getDbutils().notebook().getContext().apiToken().getOrElse(None) 65 | 66 | #create authorization header for REST calls 67 | headers = { 68 | "Authorization": f"Bearer {token}", 69 | "Content-Type": "application/json" 70 | } 71 | 72 | 73 | # Next we need an enpoint at which to execute our request which we can get from the Notebook's tags collection 74 | java_tags = dbutils.notebook.entry_point.getDbutils().notebook().getContext().tags() 75 | 76 | # This object comes from the Java CM 77 | tags = sc._jvm.scala.collection.JavaConversions.mapAsJavaMap(java_tags) 78 | 79 | # extract the databricks instance (domain name) from the dictionary 80 | instance = tags["browserHostName"] 81 | 82 | # COMMAND ---------- 83 | 84 | # MAGIC %md Defining a function called `score_model` that will pass JSON string as input to the model and get response back. 85 | 86 | # COMMAND ---------- 87 | 88 | # Import the requests library for HTTP communication 89 | import requests 90 | 91 | #change the model_serving_endpoint_name to the one you have given. 
92 | model_serving_endpoint_name = "churn_prediction" 93 | 94 | # Define the function 'score_model' which takes a dictionary as an input 95 | def score_model(data_json: str): 96 | 97 | # Construct the URL for the model serving endpoint 98 | url = f"https://{instance}/serving-endpoints/{model_serving_endpoint_name}/invocations" 99 | 100 | # Make an HTTP POST request to score the model 101 | response = requests.request(method="POST", headers=headers, url=url, data=data_json) 102 | 103 | # Check if the request was successful (HTTP status code 200) 104 | if response.status_code != 200: 105 | # If not, raise an exception detailing the failure 106 | raise Exception(f"Request failed with status {response.status_code}, {response.text}") 107 | 108 | # Return the JSON response from the model scoring endpoint 109 | return response.json() 110 | 111 | # COMMAND ---------- 112 | 113 | #reading a sample of raw data 114 | raw_data_spark_df = spark.table("bank_churn_analysis.raw_data") 115 | 116 | input_cols = [col for col in raw_data_spark_df.columns if col not in {'RowNumber', 'CustomerId', 'Surname', 'Exited'}] 117 | 118 | #drop the columns that will not be send to model as input 119 | raw_data_spark_df = raw_data_spark_df.select(*[input_cols]) 120 | 121 | pandas_df = raw_data_spark_df.toPandas() 122 | #convert to pandas dataframe 123 | 124 | #lets take 2 sample records to use as input for our serving endpoint 125 | input_examples_df_records = pandas_df[:2] 126 | input_examples_df_records 127 | 128 | # COMMAND ---------- 129 | 130 | # MAGIC %md 131 | # MAGIC ### DataFrame Records Format 132 | # MAGIC ####Overview 133 | # MAGIC The DataFrame Records format is useful when the data can be readily represented as a Pandas DataFrame. In this approach, the DataFrame is serialized into a list of dictionaries, with each dictionary corresponding to a row in the DataFrame. 134 | # MAGIC 135 | # MAGIC ####Pros and Cons 136 | # MAGIC - __Pros__: This format is easier to read and is more human-friendly. 137 | # MAGIC - __Cons__: It consumes more bandwidth because the column names are repeated for each record. 138 | # MAGIC 139 | # MAGIC #### Use Case 140 | # MAGIC This format is preferable when you need to send DataFrame-like data, and readability is a priority. 141 | 142 | # COMMAND ---------- 143 | 144 | # Serialize using json 145 | import json 146 | serialized_data = json.dumps({"dataframe_records": input_examples_df_records.to_dict('records')}, indent=4) 147 | print(serialized_data) 148 | score_model(serialized_data) 149 | 150 | # COMMAND ---------- 151 | 152 | # MAGIC %md 153 | # MAGIC 154 | # MAGIC ### DataFrame Split Format 155 | # MAGIC 156 | # MAGIC #### Overview 157 | # MAGIC 158 | # MAGIC This format represents a Pandas DataFrame in a split orientation, separating the columns, index, and data into different keys. This is a more bandwidth-efficient alternative to the records orientation. 159 | # MAGIC 160 | # MAGIC #### Pros and Cons 161 | # MAGIC 162 | # MAGIC - __Pros__: This format is more bandwidth-efficient as compared to the records orientation. 163 | # MAGIC - __Cons__: It is a bit less intuitive to read. 164 | # MAGIC 165 | # MAGIC #### Use Case 166 | # MAGIC 167 | # MAGIC This format is useful when sending DataFrame-like data, and bandwidth or payload size is a concern. 
168 | 169 | # COMMAND ---------- 170 | 171 | serialized_data = json.dumps({"dataframe_split": input_examples_df_records.to_dict('split')}, indent=4) 172 | print(serialized_data) 173 | score_model(serialized_data) 174 | -------------------------------------------------------------------------------- /Chapter-09/data/datagen.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | # MAGIC %md 3 | # MAGIC 4 | # MAGIC ### Month 1 - Base line Data 5 | # MAGIC 6 | # MAGIC We will generate a dummy dataset for showcasing model drift. The dataset consists of time series data for 3 months. 7 | # MAGIC 8 | # MAGIC The independent features of the dataset include the following features: 9 | # MAGIC 10 | # MAGIC **Features** 11 | # MAGIC * `Temperature` (Numeric) : Highest daily temperature in Fahrenheit. 12 | # MAGIC * `Weather_Condition` (Categorical): 'sunny', 'cloudy', 'rainy' 13 | # MAGIC * `Promotion_Type` (Categorical): 'discount', 'free_gift', 'bundle_deal' 14 | # MAGIC * `Website_Traffic` (Numeric): Total website traffic 15 | # MAGIC * `Device_Type` (Categorical): 16 | # MAGIC 17 | # MAGIC **Target** 18 | # MAGIC * `Daily_Sales` (Numeric): 19 | # MAGIC 20 | # MAGIC The `Daily_Sales` target will have following correlation with various features" 21 | # MAGIC * `Positive correlation` with `Temperature` and `Website_Traffic`. 22 | # MAGIC * `Negative correlation` with `Weather_Condition` and `Device_Type`. 23 | # MAGIC 24 | # MAGIC We will train our model on the first month worth of data and then simulate various drift patterns in the consecutive months of data. 25 | # MAGIC 26 | 27 | # COMMAND ---------- 28 | 29 | import numpy as np 30 | import pandas as pd 31 | import matplotlib.pyplot as plt 32 | 33 | # Set random seed for reproducibility 34 | np.random.seed(0) 35 | 36 | # Generate dates for the time series data 37 | dates = pd.date_range('2023-01-01', '2023-01-31') 38 | num_days = len(dates) 39 | # Generate independent feature data 40 | temperature = np.round(np.random.normal(loc=25, scale=5, size=num_days), 2) 41 | weather_condition = np.random.choice(['sunny', 'cloudy', 'rainy'], size=num_days, p=[0.5, 0.3, 0.2]) 42 | promotion_type = np.random.choice(['discount', 'free_gift', 'bundle_deal'], size=num_days, p=[0.4, 0.3, 0.3]) 43 | website_traffic = np.random.normal(loc=500, scale=100, size=num_days).astype(int) # Generate website traffic as integers 44 | device_type = np.random.choice(['mobile', 'desktop', 'tablet'], size=num_days, p=[0.6, 0.3, 0.1]) 45 | 46 | # Generate dependent feature data (daily sales) 47 | # Add positive correlation with temperature and website_traffic 48 | # Add negative correlation with weather_condition and device_type 49 | sales = np.round(1000 + 10*temperature + 5*website_traffic - 50*(weather_condition == 'rainy') - 100*(device_type == 'desktop')).astype(int) 50 | 51 | # Create a pandas DataFrame to store the time series data 52 | sales_data_month1 = pd.DataFrame({'Date': dates, 53 | 'Temperature': temperature, 54 | 'Weather_Condition': weather_condition, 55 | 'Promotion_Type': promotion_type, 56 | 'Website_Traffic': website_traffic, 57 | 'Device_Type': device_type, 58 | 'Daily_Sales': sales}) 59 | 60 | 61 | # COMMAND ---------- 62 | 63 | # MAGIC %md 64 | # MAGIC 65 | # MAGIC ### Month 2 - New Data Arrives 66 | # MAGIC 67 | # MAGIC Our model has been deployed for a month and we now have an incoming fresh month of data. 
68 | # MAGIC 69 | # MAGIC **Scenario:** 70 | # MAGIC * An updated upstream Data cleaning process has a bug causing the the value of, `website_traffic` counts for promotion type `bundle_deal` and `free_gift` to be empty. 71 | # MAGIC 72 | # MAGIC * Also during the upstream data generation procedure a the temperature values are now being captured in __Fahrenheit__ rather than in __Celcius__. 73 | # MAGIC 74 | # MAGIC **What are we simulating here?** 75 | # MAGIC * Feature drift 76 | # MAGIC * Upstream data errors 77 | 78 | # COMMAND ---------- 79 | 80 | # Generate dates for the time series data 81 | dates = pd.date_range('2023-02-01', '2023-02-28') 82 | num_days = len(dates) 83 | 84 | # introducing feature drift 85 | # Generate independent feature data 86 | temperature_celcicus = np.round(np.random.normal(loc=25, scale=5, size=num_days), 2) 87 | 88 | weather_condition = np.random.choice(['sunny', 'cloudy', 'rainy'], size=num_days, p=[0.5, 0.3, 0.2]) 89 | promotion_type = np.random.choice(['discount', 'free_gift', 'bundle_deal'], size=num_days, p=[0.4, 0.3, 0.3]) 90 | website_traffic = np.random.normal(loc=500, scale=100, size=num_days).astype(int) # Generate website traffic as integers 91 | device_type = np.random.choice(['mobile', 'desktop', 'tablet'], size=num_days, p=[0.6, 0.3, 0.1]) 92 | 93 | # Generate dependent feature data (daily sales) 94 | # Add positive correlation with temperature and website_traffic 95 | # Add negative correlation with weather_condition and device_type 96 | sales = np.round(1000 + 10*temperature_celcicus + 5*website_traffic - 50*(weather_condition == 'rainy') - 100*(device_type == 'desktop')).astype(int) 97 | 98 | # Create a pandas DataFrame to store the time series data 99 | sales_data_month2_correct = pd.DataFrame({'Date': dates, 100 | 'Temperature': temperature_celcicus, 101 | 'Weather_Condition': weather_condition, 102 | 'Promotion_Type': promotion_type, 103 | 'Website_Traffic': website_traffic, 104 | 'Device_Type': device_type, 105 | 'Daily_Sales': sales}) 106 | 107 | 108 | #change temperature scale to Fehrenheit 109 | #Convert the Celsius temperatures to Fahrenheit 110 | temperature_fahrenheit = (temperature_celcicus * 9 / 5) + 32 111 | 112 | 113 | # Create a pandas DataFrame to store the time series data 114 | sales_data_month2_wrong = pd.DataFrame({'Date': dates, 115 | 'Temperature': temperature_fahrenheit, 116 | 'Weather_Condition': weather_condition, 117 | 'Promotion_Type': promotion_type, 118 | 'Website_Traffic': website_traffic, 119 | 'Device_Type': device_type, 120 | 'Daily_Sales': sales}) 121 | 122 | #introducing upstream processing error causing website traffic to be empty for bundle_deal and free_gift 123 | sales_data_month2_wrong.loc[sales_data_month2_wrong['Promotion_Type'] == 'bundle_deal', 'Website_Traffic'] = None 124 | sales_data_month2_wrong.loc[ sales_data_month2_wrong['Promotion_Type'] == 'free_gift', 'Website_Traffic'] = None 125 | 126 | sales_data_month2_wrong.to_csv(f'/dbfs{raw_month2_bad_data_path}/data.csv', index=False) 127 | 128 | # COMMAND ---------- 129 | 130 | #sales_data_month2_correct 131 | 132 | # COMMAND ---------- 133 | 134 | # MAGIC %md 135 | # MAGIC ### Month 3 136 | # MAGIC 137 | # MAGIC **Scenario:** 138 | # MAGIC * A product campaign went viral on social media. Sales increased by 30% for each day. 
139 | # MAGIC 140 | # MAGIC **What are we simulating here?** 141 | # MAGIC * Concept Drift 142 | 143 | # COMMAND ---------- 144 | 145 | dates = pd.date_range('2023-03-01', '2023-03-31') 146 | num_days = len(dates) 147 | 148 | # Generate independent feature data 149 | temperature = np.round(np.random.normal(loc=25, scale=5, size=num_days), 2) 150 | weather_condition = np.random.choice(['sunny', 'cloudy', 'rainy'], size=num_days, p=[0.5, 0.3, 0.2]) 151 | promotion_type = np.random.choice(['discount', 'free_gift', 'bundle_deal'], size=num_days, p=[0.4, 0.3, 0.3]) 152 | website_traffic = np.random.normal(loc=500, scale=100, size=num_days).astype(int) # Generate website traffic as integers 153 | device_type = np.random.choice(['mobile', 'desktop', 'tablet'], size=num_days, p=[0.6, 0.3, 0.1]) 154 | 155 | #increase daily sales by 30% 156 | sales = np.round((1000 - 10*temperature + 5*website_traffic - 50*(weather_condition == 'rainy') - 100*(device_type == 'desktop')) * 1.3).astype(int) 157 | 158 | # Create a pandas DataFrame to store the time series data 159 | sales_data_month3 = pd.DataFrame({'Date': dates, 160 | 'Temperature': temperature, 161 | 'Weather_Condition': weather_condition, 162 | 'Promotion_Type': promotion_type, 163 | 'Website_Traffic': website_traffic, 164 | 'Device_Type': device_type, 165 | 'Daily_Sales': sales}) 166 | 167 | 168 | #sales_data_month3 169 | 170 | # COMMAND ---------- 171 | 172 | merged_raw_df = pd.concat([sales_data_month1, sales_data_month2_correct, sales_data_month3]) 173 | # Write the dataframe to a CSV file and give path to dbfs directory we created for storing the raw file. 174 | merged_raw_df.to_csv(f'/dbfs{raw_good_data_path}/data.csv', index=False) 175 | -------------------------------------------------------------------------------- /Chapter-04/mlflow-with-featurestore.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | # MAGIC %md # MLflow introduction. 3 | # MAGIC 4 | # MAGIC This tutorial covers an example of how to use the integrated MLflow tracking capabilities to track your model training with the integrated feature store. 5 | # MAGIC - Import data that was previously registered in the feature store table. 6 | # MAGIC - Create a baseline model for churn prediction and store it in the integrated MLflow tracking server. 7 | 8 | # COMMAND ---------- 9 | 10 | # MAGIC %md 11 | # MAGIC ###0. SETUP -- Databricks Spark cluster: 12 | # MAGIC 13 | # MAGIC 1. **Create** a cluster by... 14 | # MAGIC - Click the `Compute` icon on the left sidebar and then `Create Cluster.` 15 | # MAGIC - In `Policy` select `Unrestricted`. 16 | # MAGIC - Enter any text, i.e `demo` into the cluster name text box. 17 | # MAGIC - Select `Single Node` in the cluster mode. 18 | # MAGIC - Select the `Databricks runtime version` value `13.3 LTS (Scala 2.12, Spark 3.4.1)` from the `ML` tab. 19 | # MAGIC - On `AWS`, select `i3.xlarge` / on `Azure`, select `Standard_DS4_V2` as __Node type__. 20 | # MAGIC - Click the `create cluster` button and wait for your cluster to be provisioned 21 | # MAGIC 3. **Attach** this notebook to your cluster by... 
22 | # MAGIC - Click on your cluster name in menu `Detached` at the top left of this workbook to attach it to this workbook 23 | 24 | # COMMAND ---------- 25 | 26 | #install latest version of sklearn 27 | %pip install -U scikit-learn 28 | 29 | # COMMAND ---------- 30 | 31 | # MAGIC %md 32 | # MAGIC ### Step 1) Importing the desired libraries and defining few constants and creating training set from the registered feature table. 33 | 34 | # COMMAND ---------- 35 | 36 | from databricks.feature_store import FeatureStoreClient 37 | from databricks.feature_store import FeatureLookup 38 | import typing 39 | 40 | from sklearn import metrics 41 | from sklearn.ensemble import RandomForestClassifier 42 | from sklearn.model_selection import train_test_split 43 | import mlflow 44 | import pandas as pd 45 | 46 | # COMMAND ---------- 47 | 48 | #Name of the model 49 | MODEL_NAME = "random_forest_classifier_featurestore" 50 | #This is the name for the entry in model registry 51 | MODEL_REGISTRY_NAME = "Bank_Customer_Churn" 52 | #The email you use to authenticate in the Databricks workspace 53 | USER_EMAIL = "debu.sinha@databricks.com" 54 | #Location where the MLflow experiement will be listed in user workspace 55 | EXPERIMENT_NAME = f"/Users/{USER_EMAIL}/Bank_Customer_Churn_Analysis" 56 | # we have all the features backed into a Delta table so we will read directly 57 | FEATURE_TABLE = "bank_churn_analysis.bank_customer_features" 58 | 59 | 60 | # COMMAND ---------- 61 | 62 | 63 | # this code is just for demonstration and you can utilize this as starting point and build more errorhandling around it. 64 | class Feature_Lookup_Input_Tuple(typing.NamedTuple): 65 | fature_table_name: str 66 | feature_list: typing.Union[typing.List[str], None] 67 | lookup_key: typing.List[str] 68 | 69 | # this code is going to generate feature look up based on on the list of feature mappings provided. 70 | def generate_feature_lookup(feature_mapping: typing.List[Feature_Lookup_Input_Tuple]) -> typing.List[FeatureLookup]: 71 | lookups = [] 72 | for fature_table_name, feature_list, lookup_key in feature_mapping: 73 | lookups.append( 74 | FeatureLookup( 75 | table_name = fature_table_name, 76 | feature_names = feature_list, 77 | lookup_key = lookup_key 78 | ) 79 | ) 80 | return lookups 81 | 82 | 83 | # COMMAND ---------- 84 | 85 | # MAGIC %md 86 | # MAGIC ### Step 2) Build a simplistic model that uses the feature store table as its source for training and validation. 87 | 88 | # COMMAND ---------- 89 | 90 | #initialize the feature store client 91 | fs = FeatureStoreClient() 92 | mlflow.set_experiment(EXPERIMENT_NAME) 93 | 94 | with mlflow.start_run(): 95 | TEST_SIZE = 0.20 96 | 97 | #define the list of features we want to get from feature table 98 | #If we have to combine data from multiple feature tables then we can provide multiple mappings for feature tables 99 | features = [Feature_Lookup_Input_Tuple(FEATURE_TABLE,["CreditScore" , "Age", "Tenure",\ 100 | "Balance", "NumOfProducts", "HasCrCard",\ 101 | "IsActiveMember", "EstimatedSalary", "Geography_Germany",\ 102 | "Geography_Spain", "Gender_Male"], ["CustomerId"] )] 103 | 104 | lookups = generate_feature_lookup(features) 105 | 106 | #Now we will simulate receiving only ID's of customers and the label as input at the time of inference 107 | training_df = spark.table(FEATURE_TABLE).select("CustomerId", "Exited") 108 | 109 | #Using the training set we will combine the training dataframe with the features stored in the feature tables. 
110 | training_data = fs.create_training_set( 111 | df=training_df, 112 | feature_lookups=lookups, 113 | label="Exited", 114 | exclude_columns=['CustomerId'] 115 | ) 116 | 117 | #convert the dataset to pandas so that we can fit sklearn RandomForestClassifier on it 118 | train_df = training_data.load_df().toPandas() 119 | 120 | #The train_df represents the input dataframe that has all the feature columns along with the new raw input in the form of training_df. 121 | X = train_df.drop(['Exited'], axis=1) 122 | y = train_df['Exited'] 123 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=TEST_SIZE, random_state=54, stratify=y) 124 | 125 | #here we will are not doing any hyperparameter tuning however, in future we will see how to perform hyperparameter tuning in scalable manner on Databricks. 126 | model = RandomForestClassifier(n_estimators=100).fit(X_train, y_train) 127 | signature = mlflow.models.signature.infer_signature(X_train, model.predict(X_train)) 128 | 129 | predictions = model.predict(X_test) 130 | fpr, tpr, _ = metrics.roc_curve(y_test, predictions, pos_label=1) 131 | auc = metrics.auc(fpr, tpr) 132 | accuracy = metrics.accuracy_score(y_test, predictions) 133 | 134 | #get the calculated feature importances. 135 | importances = dict(zip(model.feature_names_in_, model.feature_importances_)) 136 | #log artifact 137 | mlflow.log_dict(importances, "feature_importances.json") 138 | #log metrics 139 | mlflow.log_metric("auc", auc) 140 | mlflow.log_metric("accuracy", accuracy) 141 | #log parameters 142 | mlflow.log_param("split_size", TEST_SIZE) 143 | mlflow.log_params(model.get_params()) 144 | #set tag 145 | mlflow.set_tag(MODEL_NAME, "mlflow and feature store demo") 146 | #log the model itself in mlflow tracking server 147 | mlflow.sklearn.log_model(model, MODEL_NAME, signature=signature, input_example=X_train.iloc[:4, :]) 148 | 149 | # finally to make the feature store track what features are being used by our model we call log_model with the feature store client 150 | fs.log_model( 151 | model, 152 | MODEL_NAME, 153 | flavor=mlflow.sklearn, 154 | training_set=training_data, 155 | registered_model_name=MODEL_REGISTRY_NAME 156 | ) 157 | 158 | 159 | 160 | # COMMAND ---------- 161 | 162 | # MAGIC %md 163 | # MAGIC ### Step 3) Now that we have the model logged to the MLflow tracking server, we can get the latest version from the experiment and use it. 164 | 165 | # COMMAND ---------- 166 | 167 | from mlflow.tracking import MlflowClient 168 | #initialize the mlflow client 169 | client = MlflowClient() 170 | #get the experiment id 171 | experiment_id = mlflow.get_experiment_by_name(EXPERIMENT_NAME).experiment_id 172 | #get the latest run id which will allow us to directly access the metrics, and attributes and all th einfo 173 | run_id = mlflow.search_runs(experiment_id, order_by=["start_time DESC"]).head(1)["run_id"].values[0] 174 | 175 | # COMMAND ---------- 176 | 177 | # MAGIC %md 178 | # MAGIC - With the feature store registration associated with the MLflow model, we don't have to specify any data loading and processing to happen other than a point to the raw data that features will be calculated from. 179 | # MAGIC - We can do batch predictions simply by accessing the feature store instance, providing the run_id and the model's name (MODEL_NAME below) with the raw data specified as the second argument. 
180 | # MAGIC - If we want to provide new values for certain feature that is already part of the feature table, just include it in the new dataframe that we want to perform the prediction on. 181 | 182 | # COMMAND ---------- 183 | 184 | #at the time of infernce you can provide just the CustomerId. This is the key that will perform all the lookup for the features automatically. 185 | predictions = fs.score_batch(f"runs:/{run_id}/{MODEL_NAME}", spark.table(FEATURE_TABLE).select("CustomerId")) 186 | 187 | # COMMAND ---------- 188 | 189 | display(predictions) 190 | 191 | # COMMAND ---------- 192 | 193 | # MAGIC %md 194 | # MAGIC ##Cleanup 195 | 196 | # COMMAND ---------- 197 | 198 | #Uncomment to lines below and execute for cleaning up. 199 | ''' 200 | from mlflow.tracking import MlflowClient 201 | 202 | #get all the information about the current experiment 203 | experiment_id = mlflow.get_experiment_by_name(EXPERIMENT_NAME).experiment_id 204 | 205 | #list all the runs that are part of this experiment and delete them 206 | runs = mlflow.list_run_infos(experiment_id=experiment_id) 207 | for run in runs: 208 | mlflow.delete_run(run_id = run.run_id) 209 | 210 | #finally delete the experiment 211 | mlflow.delete_experiment(experiment_id=experiment_id) 212 | 213 | client = MlflowClient() 214 | #delete the model registered in the registry to clear the linkage in thefeature store 215 | client.delete_registered_model(name=MODEL_REGISTRY_NAME) 216 | ''' 217 | -------------------------------------------------------------------------------- /Chapter-07/batch-and-streaming.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | # MAGIC %md 3 | # MAGIC %md 4 | # MAGIC ## Author 5 | # MAGIC 6 | # MAGIC - **Debu Sinha** 7 | # MAGIC 8 | # MAGIC ## Tested Environment 9 | # MAGIC 10 | # MAGIC - **Databricks Runtime**: This notebook is tested on Databricks Runtime for Machine Learning 13.3 LTS or above. 11 | # MAGIC - **Cluster Configuration**: Single node cluster with at least 32GB RAM and 4 VCPU. 12 | # MAGIC - **Note**: The same cluster set up in Chapters 3 and 4 will be used here. 13 | # MAGIC 14 | # MAGIC ## Cluster Setup Instructions 15 | # MAGIC 16 | # MAGIC 1. **Create a Cluster**: 17 | # MAGIC - Navigate to the `Compute` icon on the left sidebar and click on `Create Cluster`. 18 | # MAGIC - Under `Policy`, select `Unrestricted`. 19 | # MAGIC - Enter a name for your cluster, for example, `demo`, into the cluster name text box. 20 | # MAGIC - In `Cluster Mode`, select `Single Node`. 21 | # MAGIC - Choose `Databricks Runtime Version` 13.3 LTS (Scala 2.12, Spark 3.4.1) from the `ML` tab. 22 | # MAGIC - On `AWS`, select `i3.xlarge` / on `Azure`, select `Standard_DS4_V2` as __Node type__. 23 | # MAGIC - Click on `Create Cluster` and wait for your cluster to be provisioned. 24 | # MAGIC 25 | # MAGIC 2. **Attach this Notebook to Your Cluster**: 26 | # MAGIC - Click on the menu labeled `Detached` at the top left of this workbook. 27 | # MAGIC - Select your cluster name to attach this notebook to your cluster. 28 | # MAGIC 29 | # MAGIC ## Batch Deployment 30 | # MAGIC 31 | # MAGIC This notebook will go over the most common model deployment option of batch inferencing. We will load the latest model version for our Bank customer churn prediction problem from the model registry and load it as a python function that can be applied to a Spark Dataframe. 
32 | 33 | # COMMAND ---------- 34 | 35 | # MAGIC %md ### Inference in Spark 36 | # MAGIC 37 | # MAGIC So far we have seen how you can use different machine learning libraries to train your model. When it comes to deployment, we can now utilize the power of Spark to distribute our trained model across more than a single node and make predictions at scale. 38 | # MAGIC 39 | # MAGIC To do this, we will use `mlflow.pyfunc.spark_udf` and pass in the `SparkSession`, the name of the model, and the run ID. 40 | # MAGIC 41 | # MAGIC Note: Using UDFs in Spark means that supporting libraries must be installed on every node in the cluster. In the case of `sklearn`, this is installed in Databricks clusters by default. When using other libraries, you will need to install them to ensure that they will work as UDFs. 42 | 43 | # COMMAND ---------- 44 | 45 | # MAGIC %md 46 | # MAGIC First we will load the desired model from the model registry. 47 | 48 | # COMMAND ---------- 49 | 50 | import mlflow 51 | 52 | # the name of the model in the registry 53 | registry_model_name = "Churn Prediction Bank" 54 | 55 | # get the latest version of the model in staging and load it as a spark_udf. 56 | # MLflow easily produces a Spark user defined function (UDF). This bridges the gap between Python environments and applying models at scale using Spark. 57 | model = mlflow.pyfunc.spark_udf(spark, model_uri=f"models:/{registry_model_name}/staging") 58 | 59 | # COMMAND ---------- 60 | 61 | # MAGIC %md 62 | # MAGIC This model was trained on the raw dataset using Databricks AutoML. 63 | # MAGIC 64 | # MAGIC Note: Make sure the dataset we want to run inference on matches the schema of the dataset the model was trained on. In the current example we will simply reuse the dataset we used to train our model. 65 | # MAGIC - As a best practice, model-specific transformations such as imputing missing values or scaling a column value should be done as part of the model pipeline and not when registering a table as a feature table. 66 | 67 | # COMMAND ---------- 68 | 69 | spark_df = spark.table("bank_churn_analysis.raw_Data") 70 | display(spark_df) 71 | 72 | # COMMAND ---------- 73 | 74 | # MAGIC %md 75 | # MAGIC Note: we will not send the RowNumber, CustomerId, Surname and Exited columns to the model. 76 | 77 | # COMMAND ---------- 78 | 79 | exclude_columns = {'RowNumber', "CustomerId", "Surname", "Exited"} 80 | input_columns = [col for col in spark_df.columns if col not in exclude_columns] 81 | input_columns 82 | 83 | # COMMAND ---------- 84 | 85 | # MAGIC %md Apply the model as a standard UDF using the column names as the input to the function. 86 | 87 | # COMMAND ---------- 88 | 89 | #passing non-label columns to the model as input 90 | prediction_df = spark_df.withColumn("prediction", model(*input_columns)) 91 | 92 | display(prediction_df) 93 | 94 | # COMMAND ---------- 95 | 96 | # MAGIC %md 97 | # MAGIC Now you can write the inference output to a database for fast access, to a Delta table, or to any other file format depending on your application's needs. 98 | 99 | # COMMAND ---------- 100 | 101 | # MAGIC %md 102 | # MAGIC __Note:__ In the above example we showcased how you can use the MLflow API to perform batch inference. We didn't make use of the model trained on the feature table that we created in Chapter 2. If you want to utilize the Feature Store API to log a trained model and also perform batch inference, check the notebook in Chapter 4, which has the details. 
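# COMMAND ----------

# MAGIC %md
# MAGIC Before moving on to streaming, the cell below is a minimal sketch of persisting the batch predictions produced above to a Delta table, as mentioned earlier in this notebook. The target table name `bank_churn_analysis.churn_batch_predictions` is a placeholder chosen for illustration; point it at whatever database and table fit your environment.

# COMMAND ----------

# Illustrative sketch: persist the scored DataFrame as a Delta table so downstream consumers can query the predictions.
# "overwrite" keeps this example idempotent; for incremental scoring runs you would typically
# use mode("append") or a Delta MERGE keyed on CustomerId instead.
(prediction_df
 .write
 .format("delta")
 .mode("overwrite")
 .saveAsTable("bank_churn_analysis.churn_batch_predictions"))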
103 | 104 | # COMMAND ---------- 105 | 106 | # MAGIC %md # Streaming Deployment 107 | 108 | # COMMAND ---------- 109 | 110 | # MAGIC %md 111 | # MAGIC We can also perform continuous model inference using a technology like Spark's Structured Streaming. You can read more about it [here](https://spark.apache.org/docs/latest/structured-streaming-programming-guide.html). The advantages of using Spark to build your streaming ingestion pipelines and model inference solution are that: 112 | # MAGIC - It offers the same DataFrame API for processing streaming data as you would use with batch data. 113 | # MAGIC - It provides a scalable and fault-tolerant way to continuously perform inference on incoming new data. 114 | # MAGIC 115 | # MAGIC We will not go into the details of Spark Structured Streaming here, but we will cover how you can deploy a model for inference on a stream of data. 116 | # MAGIC 117 | # MAGIC The first step is usually to connect to a streaming data source like Kafka, Azure Event Hubs, or Kinesis. Using Spark Structured Streaming you can also simulate reading files as a stream from cloud storage like S3. For our example we are going to do just that. 118 | # MAGIC 119 | # MAGIC We'll read a Delta table as a stream. 120 | 121 | # COMMAND ---------- 122 | 123 | # right now we are just defining a streaming data source; this statement will not execute until we call a Spark action. 124 | raw_streaming_df = spark.readStream.format("delta").option("ignoreChanges", "true").table("bank_churn_analysis.raw_Data").drop(*("RowNumber", "CustomerId", "Surname", "Exited")) 125 | 126 | # if you want to read from an S3 location then use the next set of code 127 | # streaming_data = (spark 128 | # .readStream 129 | # .schema(schema) 130 | # .option("maxFilesPerTrigger", 1) 131 | # .parquet("") 132 | # .drop(*("RowNumber", "CustomerId", "Surname", "Exited"))) 133 | 134 | # COMMAND ---------- 135 | 136 | # we will use this to keep track of our streaming job 137 | stream_name = "streaming_inference" 138 | 139 | # COMMAND ---------- 140 | 141 | predictions_df = raw_streaming_df.withColumn("prediction", model(*raw_streaming_df.columns)) 142 | display(predictions_df, streamName=stream_name) 143 | 144 | # COMMAND ---------- 145 | 146 | # A Spark structured stream takes some time to finish initializing, and trying to shut it off before it is active will throw an error. This code waits (up to 20 seconds) for the stream to become active. 147 | active_streams = [stream.name for stream in spark.streams.active] 148 | active_streams 149 | 150 | import time 151 | start_time = time.time() 152 | while stream_name not in [stream.name for stream in spark.streams.active]: 153 | time.sleep(5) 154 | # wait up to 20 seconds to let the stream initialize 155 | if time.time()-start_time>20: 156 | # stream initialization was not kicked off or there is some network issue. 157 | break 158 | 159 | # COMMAND ---------- 160 | 161 | # We will stop the stream after reviewing results 162 | for stream in spark.streams.active: 163 | print(f"Stopping {stream.name}") 164 | stream.stop() # Stop the stream 165 | 166 | # COMMAND ---------- 167 | 168 | # MAGIC %md 169 | # MAGIC 170 | # MAGIC ### Write to Delta table 171 | 172 | # COMMAND ---------- 173 | 174 | working_dir = "/tmp" 175 | # this is important for streaming queries to keep track of which records have been processed and guarantee each record is processed only once. 
176 | checkpoint_location = f"{working_dir}/stream.checkpoint" 177 | # this is a temporary location where we will write the predictions of our model as Delta table 178 | write_path = f"{working_dir}/predictions" 179 | 180 | (predictions_df 181 | .writeStream # Write the stream 182 | .queryName(stream_name) # Name the query 183 | .format("delta") # Use the delta format 184 | .option("checkpointLocation", checkpoint_location) # Specify where to log metadata 185 | .option("path", write_path) # Specify the output path 186 | .outputMode("append") # "append" means append the new data to the table 187 | .start() # Start the operation 188 | ) 189 | 190 | # COMMAND ---------- 191 | 192 | # MAGIC %md 193 | # MAGIC we can take a look at what files are written to the file system 194 | 195 | # COMMAND ---------- 196 | 197 | # MAGIC %fs 198 | # MAGIC ls /tmp/predictions/ 199 | 200 | # COMMAND ---------- 201 | 202 | # MAGIC %sql 203 | # MAGIC select * from delta.`/tmp/predictions` 204 | 205 | # COMMAND ---------- 206 | 207 | # We will stop the stream after writing the data to the delta table 208 | for stream in spark.streams.active: 209 | print(f"Stopping {stream.name}") 210 | stream.stop() # Stop the stream 211 | -------------------------------------------------------------------------------- /Chapter-03/churn-analysis.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | # MAGIC %md 3 | # MAGIC * [**Customer Churn**](https://en.wikipedia.org/wiki/Customer_attrition) also known as Customer attrition, customer turnover, or customer defection, is the loss of clients or customers and is... 4 | # MAGIC * Built on top of Databricks Platform 5 | # MAGIC * Uses Databricks ML runtime and Feature store 6 | # MAGIC * This Notebook... 7 | # MAGIC * We will use Customer Churn dataset from the [Kaggle](https://www.kaggle.com/mathchi/churn-for-bank-customers). 8 | # MAGIC * We will skip the EDA part and focus on the feature engineering part and registering feature tables into Databricks feature store. 9 | 10 | # COMMAND ---------- 11 | 12 | # MAGIC %md 13 | # MAGIC ###0. SETUP -- Databricks Spark cluster: 14 | # MAGIC 15 | # MAGIC 1. **Create** a cluster by... 16 | # MAGIC - Click the `Compute` icon on the left sidebar and then `Create Cluster.` 17 | # MAGIC - In `Policy` select `Unrestricted`. 18 | # MAGIC - Enter any text, i.e `demo` into the cluster name text box. 19 | # MAGIC - Select `Single Node` in the cluster mode. 20 | # MAGIC - Select the `Databricks runtime version` value `13.3 LTS (Scala 2.12, Spark 3.4.1)` from the `ML` tab. 21 | # MAGIC - On `AWS`, select `i3.xlarge` / on `Azure`, select `Standard_DS4_V2` as __Node type__. 22 | # MAGIC - Click the `create cluster` button and wait for your cluster to be provisioned 23 | # MAGIC 3. **Attach** this notebook to your cluster by... 24 | # MAGIC - Click on your cluster name in menu `Detached` at the top left of this workbook to attach it to this workbook 25 | 26 | # COMMAND ---------- 27 | 28 | # MAGIC %md 29 | # MAGIC ###Step1: Ingest Data to Notebook 30 | # MAGIC 31 | # MAGIC We will download the dataset hosted at [**Kaggle**](https://www.kaggle.com/mathchi/churn-for-bank-customers) 32 | # MAGIC 33 | # MAGIC ## Content 34 | # MAGIC * `RowNumber` —corresponds to the record (row) number and has no effect on the output. 35 | # MAGIC * `CustomerId` -contains random values and has no effect on customer leaving the bank. 
36 | # MAGIC * `Surname` —the surname of a customer has no impact on their decision to leave the bank. 37 | # MAGIC * `CreditScore` —can have an effect on customer churn, since a customer with a higher credit score is less likely to leave the bank. 38 | # MAGIC * `Geography` —a customer’s location can affect their decision to leave the bank. 39 | # MAGIC * `Gender` —it’s interesting to explore whether gender plays a role in a customer leaving the bank. 40 | # MAGIC * `Age` —this is certainly relevant, since older customers are less likely to leave their bank than younger ones. 41 | # MAGIC * `Tenure` —refers to the number of years that the customer has been a client of the bank. Normally, older clients are more loyal and less likely to leave a bank. 42 | # MAGIC * `Balance` —also a very good indicator of customer churn, as people with a higher balance in their accounts are less likely to leave the bank compared to those with lower balances. 43 | # MAGIC * `NumOfProducts` —refers to the number of products that a customer has purchased through the bank. 44 | # MAGIC * `HasCrCard` —denotes whether or not a customer has a credit card. This column is also relevant, since people with a credit card are less likely to leave the bank. 45 | # MAGIC * `IsActiveMember` —active customers are less likely to leave the bank. 46 | # MAGIC * `EstimatedSalary` —as with balance, people with lower salaries are more likely to leave the bank compared to those with higher salaries. 47 | # MAGIC * `Exited` —whether or not the customer left the bank. 48 | # MAGIC 49 | # MAGIC ## Acknowledgements 50 | # MAGIC 51 | # MAGIC As we know, it is much more expensive to sign up a new client than to keep an existing one. 52 | # MAGIC It is advantageous for banks to know what leads a client towards the decision to leave the company. 53 | # MAGIC Churn prevention allows companies to develop loyalty programs and retention campaigns to keep as many customers as possible. 54 | # MAGIC 55 | # MAGIC Data: https://www.kaggle.com/mathchi/churn-for-bank-customers 56 | 57 | # COMMAND ---------- 58 | 59 | # MAGIC %md 60 | # MAGIC ## Import Data 61 | # MAGIC 62 | # MAGIC Next, we'll import our data for this part. 63 | 64 | # COMMAND ---------- 65 | 66 | #read more about reading files from Databricks repos at https://docs.databricks.com/repos.html#access-files-in-a-repo-programmatically 67 | import os 68 | bank_df = spark.read.option("header", True).option("inferSchema", True).csv(f"file:{os.getcwd()}/data/churn.csv") 69 | display(bank_df) 70 | 71 | # COMMAND ---------- 72 | 73 | # MAGIC %md 74 | # MAGIC We can drop RowNumber in the feature engineering step as it does not add any valuable information. 75 | # MAGIC 76 | # MAGIC **Note:** 77 | # MAGIC Databricks introduced a built-in data profiler for Spark DataFrames. The built-in `display` function now gives an option to profile the data automatically. 78 | 79 | # COMMAND ---------- 80 | 81 | display(bank_df) 82 | 83 | # COMMAND ---------- 84 | 85 | # MAGIC %md Let's get the unique value count in the Surname column. 86 | 87 | # COMMAND ---------- 88 | 89 | bank_df.select('Surname').distinct().count() 90 | 91 | # COMMAND ---------- 92 | 93 | # MAGIC %md 94 | # MAGIC As we can see, the Surname column has a lot of unique values and does not add any useful information, so we will drop it in our feature engineering step. 
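# COMMAND ----------

# MAGIC %md
# MAGIC As an optional, illustrative check (a sketch, not part of the original workflow), we can compute the distinct-value count for every column in a single pass instead of inspecting one column at a time. Identifier-like columns such as RowNumber and CustomerId will show counts equal to the row count, and high-cardinality columns such as Surname will also stand out, which supports dropping them during feature engineering.

# COMMAND ----------

import pyspark.sql.functions as F

# Count distinct values per column in one aggregation over the raw DataFrame.
distinct_counts = bank_df.select([F.countDistinct(F.col(c)).alias(c) for c in bank_df.columns])
display(distinct_counts)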
95 | 96 | # COMMAND ---------- 97 | 98 | # MAGIC %md 99 | # MAGIC ## Create Feature Table 100 | # MAGIC 101 | # MAGIC Next, we can use the DataFrame **`bank_df`** to create a feature table using Feature Store. 102 | # MAGIC 103 | # MAGIC **In order to write our features out as a feature table we will perform the following steps:** 104 | # MAGIC 1. Create a Database that will store any feature table. In our case let that be `bank_churn_analysis` 105 | # MAGIC 1. Write the Python functions to compute the features. The output of each function should be an Apache Spark DataFrame with a unique primary key. The primary key can consist of one or more columns. 106 | # MAGIC 1. Create a feature table by instantiating a FeatureStoreClient and using create_table (Databricks Runtime 10.2 ML or above) or create_feature_table (Databricks Runtime 10.1 ML or below). 107 | # MAGIC 1. Populate the feature table using write_table. 108 | # MAGIC 109 | # MAGIC Note: 110 | # MAGIC - **If you want to prevent any data leakage you would want to consider not performing OHE or any feature treatment at the time of registering dataset as a feature table. ** 111 | 112 | # COMMAND ---------- 113 | 114 | # MAGIC %md 115 | # MAGIC ## 1. Defining a database to store feature tables. 116 | 117 | # COMMAND ---------- 118 | 119 | DATABASE_NAME = "bank_churn_analysis" 120 | #setup database that will hold our Feature tables in Delta format. 121 | spark.sql(f"CREATE DATABASE IF NOT EXISTS {DATABASE_NAME}") 122 | 123 | # COMMAND ---------- 124 | 125 | # MAGIC %md 126 | # MAGIC write the raw data out as a delta table 127 | 128 | # COMMAND ---------- 129 | 130 | bank_df.write.format("delta").mode("overwrite").saveAsTable(f"{DATABASE_NAME}.raw_data") 131 | 132 | # COMMAND ---------- 133 | 134 | # MAGIC %md 135 | # MAGIC ## 2. Defining a feature engineering function that will return a Spark dataframe with a unique primary key. 136 | # MAGIC In our case it is the `CustomerId`. 137 | 138 | # COMMAND ---------- 139 | 140 | # MAGIC %md 141 | # MAGIC 142 | # MAGIC The `bank_df` DataFrame is already pretty clean, but we do have some nominal features that we'll need to convert to numeric features for modeling. 143 | # MAGIC 144 | # MAGIC These features include: 145 | # MAGIC 146 | # MAGIC * **`Geography`** 147 | # MAGIC * **`Gender`** 148 | # MAGIC 149 | # MAGIC We will also be dropping few features which dont add additional value for our model: 150 | # MAGIC * **`RowNumber`** 151 | # MAGIC * **`Surname`** 152 | # MAGIC 153 | # MAGIC ### Create `compute_features` Function 154 | # MAGIC 155 | # MAGIC A lot of data scientists are familiar with Pandas DataFrames, so we'll use the [pyspark.pandas](https://spark.apache.org/docs/3.2.0/api/python/user_guide/pandas_on_spark/) library to one-hot encode these categorical features. 156 | # MAGIC 157 | # MAGIC **Note:** we are creating a function to perform these computations. We'll use it to refer to this set of instructions when creating our feature table. 
158 | 159 | # COMMAND ---------- 160 | 161 | 162 | import pyspark.pandas as ps 163 | import numpy as np 164 | 165 | def compute_features(spark_df): 166 | # https://spark.apache.org/docs/latest/api/python/migration_guide/koalas_to_pyspark.html?highlight=dataframe%20pandas_api 167 | # Convert to pyspark.pandas DataFrame 168 | ps_df = spark_df.pandas_api() 169 | 170 | # Drop RowNumber & Surname column 171 | ps_df = ps_df.drop(['RowNumber', 'Surname'], axis=1) 172 | 173 | # One-Hot Encoding for Geography and Gender 174 | ohe_ps_df = ps.get_dummies( 175 | ps_df, 176 | columns=["Geography", "Gender"], 177 | dtype="int", 178 | drop_first=True 179 | ) 180 | 181 | # Clean up column names 182 | ohe_ps_df.columns = ohe_ps_df.columns.str.replace(r' ', '', regex=True) 183 | ohe_ps_df.columns = ohe_ps_df.columns.str.replace(r'(', '-', regex=True) 184 | ohe_ps_df.columns = ohe_ps_df.columns.str.replace(r')', '', regex=True) 185 | 186 | ## Additional example feature engineering steps 187 | 188 | # # Create a binary feature indicating whether the balance is zero or not 189 | # ohe_ps_df['Is_Balance_Zero'] = (ohe_ps_df['Balance'] == 0).astype('int') 190 | 191 | # # Ratio of Tenure to Age 192 | # ohe_ps_df['Tenure_to_Age'] = ohe_ps_df['Tenure'] / ohe_ps_df['Age'] 193 | 194 | # # Interaction feature: Balance to EstimatedSalary ratio 195 | # ohe_ps_df['Balance_to_Salary'] = ohe_ps_df['Balance'] / ohe_ps_df['EstimatedSalary'] 196 | 197 | return ohe_ps_df 198 | 199 | 200 | # COMMAND ---------- 201 | 202 | # MAGIC %md 203 | # MAGIC ### Compute Features 204 | # MAGIC 205 | # MAGIC Next, we can use our featurization function `compute_features` to create create a DataFrame of our features. 206 | 207 | # COMMAND ---------- 208 | 209 | bank_features_df = compute_features(bank_df) 210 | display(bank_features_df) 211 | 212 | # COMMAND ---------- 213 | 214 | # MAGIC %md 215 | # MAGIC ##3. Create the Feature Table 216 | # MAGIC 217 | # MAGIC Next, we can use the `feature_table` operation to register the DataFrame as a Feature Store table. 218 | # MAGIC 219 | # MAGIC In order to do this, we'll want the following details: 220 | # MAGIC 221 | # MAGIC 1. The `name` of the database and table where we want to store the feature table 222 | # MAGIC 1. The `keys` for the table 223 | # MAGIC 1. The `schema` of the table 224 | # MAGIC 1. A `description` of the contents of the feature table 225 | # MAGIC 1. `partition_columns`- Column(s) used to partition the feature table. 226 | # MAGIC 1. `features_df`(optional) - Data to insert into this feature table. The schema of features_df will be used as the feature table schema. 227 | # MAGIC 228 | # MAGIC **Note:** 229 | # MAGIC 1. This creates our feature table, but we still need to write our values in the DataFrame to the table. 230 | 231 | # COMMAND ---------- 232 | 233 | #Our first step is to instantiate the feature store client using `FeatureStoreClient()`. 234 | from databricks.feature_store import FeatureStoreClient 235 | fs = FeatureStoreClient() 236 | 237 | # COMMAND ---------- 238 | 239 | # MAGIC %md 240 | # MAGIC We have __2__ options to initialize a feature table. 241 | # MAGIC 242 | # MAGIC 1. Providing Dataframe to populate feature table at time of defining feature table. This approach can be used when you have a feature dataframe ready to instantiate a feature table. 
243 | # MAGIC ``` 244 | # MAGIC bank_feature_table = fs.create_table( 245 | # MAGIC name=f"{DATABASE_NAME}.bank_customer_features", # the name of the feature table 246 | # MAGIC primary_keys=["CustomerId"], # primary key that will be used to perform joins 247 | # MAGIC schema=bank_features_df.spark.schema(), # the schema of the Feature table 248 | # MAGIC description="This customer level table contains one-hot encoded categorical and scaled numeric features to predict bank customer churn.", 249 | # MAGIC feature_df=bank_features_df.to_spark() 250 | # MAGIC ) 251 | # MAGIC ``` 252 | # MAGIC 2. In second case you can provide definition of the feature table without providing a source dataframe. This approach can be used when your data to populate feature store will be ingested at a different time then when you are defining the feature table. We will be showcasing this approach as part of the notebook. 253 | 254 | # COMMAND ---------- 255 | 256 | bank_feature_table = fs.create_table( 257 | name=f"{DATABASE_NAME}.bank_customer_features", # the name of the feature table 258 | primary_keys=["CustomerId"], # primary key that will be used to perform joins 259 | schema=bank_features_df.spark.schema(), # the schema of the Feature table 260 | description="This customer level table contains one-hot encoded categorical and scaled numeric features to predict bank customer churn." 261 | ) 262 | 263 | # COMMAND ---------- 264 | 265 | # MAGIC %md 266 | # MAGIC ## 4. Populate the feature table using write_table. 267 | # MAGIC Now, we can write the records from **`bank_features_df`** to the feature table. 268 | 269 | # COMMAND ---------- 270 | 271 | fs.write_table(df=bank_features_df.to_spark(), name=f"{DATABASE_NAME}.bank_customer_features", mode="overwrite") 272 | #instead of overwrite you can choose "merge" as an option if you want to update only certain records. 273 | 274 | # COMMAND ---------- 275 | 276 | # MAGIC %md 277 | # MAGIC ##5. Browsing the Feature Store 278 | # MAGIC 279 | # MAGIC The tables are now visible and searchable in the [Feature Store](/#feature-store/feature-store) 280 | 281 | # COMMAND ---------- 282 | 283 | # MAGIC %md 284 | # MAGIC Optionally if your usecase requires joining features for real time inference, you can write your features out to an [online store](https://docs.databricks.com/applications/machine-learning/feature-store.html#publish-features-to-an-online-feature-store). 285 | # MAGIC 286 | # MAGIC And finally, we can perform Access Control using built-in features in the Feature Store UI. 287 | 288 | # COMMAND ---------- 289 | 290 | # MAGIC %md 291 | # MAGIC ### Cleanup 292 | 293 | # COMMAND ---------- 294 | 295 | #Drop feature table. This will drop the underlying Delta table as well. 296 | 297 | # fs.drop_table( 298 | # name=f"{DATABASE_NAME}.bank_customer_features" 299 | # ) 300 | 301 | # COMMAND ---------- 302 | 303 | # MAGIC %md 304 | # MAGIC Note: In you decide to drop table from UI follow the follwing steps.. 305 | # MAGIC 306 | # MAGIC Follow the following steps: 307 | # MAGIC - Go to [Feature Store](/#feature-store/feature-store) 308 | # MAGIC - Select the feature tables and select `delete` after clicking on 3 vertical dots icon. 309 | # MAGIC 310 | # MAGIC Deleting the feature tables in this way requires you to manually delete the published online tables and the underlying Delta table separately. 
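# COMMAND ----------

# MAGIC %md
# MAGIC The commented-out cell below is a minimal sketch of the online-store publishing step mentioned above. It assumes an AWS workspace where the Feature Store already has credentials to write to DynamoDB; the spec class, region, and authentication arguments are placeholders and vary by cloud provider and feature-store client version, so treat this as illustrative only and consult the Feature Store documentation for your setup.

# COMMAND ----------

# Uncomment and adapt to publish the offline feature table to an online store for low-latency lookups at inference time.

# from databricks.feature_store.online_store_spec import AmazonDynamoDBSpec
#
# online_store_spec = AmazonDynamoDBSpec(region="us-west-2")  # region is a placeholder
#
# fs.publish_table(
#     name=f"{DATABASE_NAME}.bank_customer_features",
#     online_store=online_store_spec,
#     mode="merge"  # "merge" updates existing keys instead of overwriting the whole online table
# )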
311 | 312 | # COMMAND ---------- 313 | 314 | 315 | -------------------------------------------------------------------------------- /Chapter-07/real-time-additional.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | # MAGIC %md 3 | # MAGIC ## Author 4 | # MAGIC 5 | # MAGIC - **Debu Sinha** 6 | # MAGIC 7 | # MAGIC ## Tested Environment 8 | # MAGIC 9 | # MAGIC - **Databricks Runtime**: This notebook is tested on Databricks Runtime for Machine Learning 13.3 LTS or above. 10 | # MAGIC - **Cluster Configuration**: Single node cluster with at least 32GB RAM and 4 VCPU. 11 | # MAGIC - **Note**: The same cluster set up in Chapters 3 and 4 will be used here. 12 | # MAGIC 13 | # MAGIC ## Cluster Setup Instructions 14 | # MAGIC 15 | # MAGIC 1. **Create a Cluster**: 16 | # MAGIC - Navigate to the `Compute` icon on the left sidebar and click on `Create Cluster`. 17 | # MAGIC - Under `Policy`, select `Unrestricted`. 18 | # MAGIC - Enter a name for your cluster, for example, `demo`, into the cluster name text box. 19 | # MAGIC - In `Cluster Mode`, select `Single Node`. 20 | # MAGIC - Choose `Databricks Runtime Version` 13.3 LTS (Scala 2.12, Spark 3.4.1) from the `ML` tab. 21 | # MAGIC - On `AWS`, select `i3.xlarge` / on `Azure`, select `Standard_DS4_V2` as __Node type__. 22 | # MAGIC - Click on `Create Cluster` and wait for your cluster to be provisioned. 23 | # MAGIC 24 | # MAGIC 2. **Attach this Notebook to Your Cluster**: 25 | # MAGIC - Click on the menu labeled `Detached` at the top left of this workbook. 26 | # MAGIC - Select your cluster name to attach this notebook to your cluster. 27 | 28 | # COMMAND ---------- 29 | 30 | # MAGIC %pip install assertpy 31 | 32 | # COMMAND ---------- 33 | 34 | # the name of model in model registry you want to serve with serving endpoint. 35 | model_name = "Churn Prediction Bank" 36 | 37 | # serving endpoint name 38 | model_serving_endpoint_name = "churn_prediction_api_deployment" 39 | 40 | # COMMAND ---------- 41 | 42 | # MAGIC %md 43 | # MAGIC 44 | # MAGIC 45 | # MAGIC --- 46 | # MAGIC 47 | # MAGIC # Code Documentation for Token and Header Setup in Databricks 48 | # MAGIC 49 | # MAGIC This document provides an in-depth overview of the code that fetches the API token from a Databricks notebook, sets up the authorization header for REST API calls, and retrieves the Databricks instance URL. 50 | # MAGIC 51 | # MAGIC --- 52 | # MAGIC 53 | # MAGIC ## Code Sections 54 | # MAGIC 55 | # MAGIC ### 1. Fetch API Token from Databricks Notebook 56 | # MAGIC 57 | # MAGIC #### Purpose 58 | # MAGIC 59 | # MAGIC - Fetches the Databricks API token from the current notebook's context. 60 | # MAGIC 61 | # MAGIC #### Code Explanation 62 | # MAGIC 63 | # MAGIC - `token = dbutils.notebook.entry_point.getDbutils().notebook().getContext().apiToken().getOrElse(None)` 64 | # MAGIC 65 | # MAGIC #### Libraries Used 66 | # MAGIC 67 | # MAGIC - `dbutils`: Databricks utility to interact with Databricks services. 68 | # MAGIC 69 | # MAGIC --- 70 | # MAGIC 71 | # MAGIC ### 2. Create Authorization Headers 72 | # MAGIC 73 | # MAGIC #### Purpose 74 | # MAGIC 75 | # MAGIC - Sets up the headers required for authorization and content-type in REST API calls. 76 | # MAGIC 77 | # MAGIC #### Code Explanation 78 | # MAGIC 79 | # MAGIC - `headers = { "Authorization": f"Bearer {token}", "Content-Type": "application/json" }` 80 | # MAGIC 81 | # MAGIC #### Libraries Used 82 | # MAGIC 83 | # MAGIC - None. 
84 | # MAGIC 85 | # MAGIC --- 86 | # MAGIC 87 | # MAGIC ### 3. Fetch Databricks Instance URL 88 | # MAGIC 89 | # MAGIC #### Purpose 90 | # MAGIC 91 | # MAGIC - Retrieves the Databricks instance URL for further API calls. 92 | # MAGIC 93 | # MAGIC #### Code Explanation 94 | # MAGIC 95 | # MAGIC 1. `java_tags = dbutils.notebook.entry_point.getDbutils().notebook().getContext().tags()`: Fetches the notebook's tags as a Java object. 96 | # MAGIC 2. `tags = sc._jvm.scala.collection.JavaConversions.mapAsJavaMap(java_tags)`: Converts the Java tags object to a Python dictionary. 97 | # MAGIC 3. `instance = tags["browserHostName"]`: Extracts the Databricks instance (domain name) from the tags dictionary. 98 | # MAGIC 99 | # MAGIC #### Libraries Used 100 | # MAGIC 101 | # MAGIC - `dbutils`: Databricks utility. 102 | # MAGIC - `sc._jvm.scala.collection.JavaConversions`: Scala library for Java to Python type conversion. 103 | # MAGIC 104 | # MAGIC --- 105 | # MAGIC 106 | # MAGIC 107 | 108 | # COMMAND ---------- 109 | 110 | # get token from notebook 111 | token = dbutils.notebook.entry_point.getDbutils().notebook().getContext().apiToken().getOrElse(None) 112 | 113 | #create authorization header for REST calls 114 | headers = { 115 | "Authorization": f"Bearer {token}", 116 | "Content-Type": "application/json" 117 | } 118 | 119 | 120 | # Next we need an enpoint at which to execute our request which we can get from the Notebook's tags collection 121 | java_tags = dbutils.notebook.entry_point.getDbutils().notebook().getContext().tags() 122 | 123 | # This object comes from the Java CM 124 | tags = sc._jvm.scala.collection.JavaConversions.mapAsJavaMap(java_tags) 125 | 126 | # extract the databricks instance (domain name) from the dictionary 127 | instance = tags["browserHostName"] 128 | 129 | # COMMAND ---------- 130 | 131 | # MAGIC %md 132 | # MAGIC 133 | # MAGIC 134 | # MAGIC --- 135 | # MAGIC 136 | # MAGIC # Code Documentation for `get_latest_model_version` Function 137 | # MAGIC 138 | # MAGIC This document offers a comprehensive overview of the `get_latest_model_version` function which retrieves the latest version number of a specified model from MLflow's model registry. 139 | # MAGIC 140 | # MAGIC --- 141 | # MAGIC 142 | # MAGIC ## Function Overview 143 | # MAGIC 144 | # MAGIC ### `get_latest_model_version` 145 | # MAGIC 146 | # MAGIC Retrieves the latest version of a given model from the MLflow model registry. 147 | # MAGIC 148 | # MAGIC --- 149 | # MAGIC 150 | # MAGIC ## Detailed Function Description 151 | # MAGIC 152 | # MAGIC ### Function: `get_latest_model_version` 153 | # MAGIC 154 | # MAGIC #### Purpose 155 | # MAGIC 156 | # MAGIC - Fetches the most recent version of a specified model from the MLflow model registry. 157 | # MAGIC 158 | # MAGIC #### Parameters 159 | # MAGIC 160 | # MAGIC - `model_name`: Name of the model for which the latest version is to be fetched. 161 | # MAGIC 162 | # MAGIC #### Process 163 | # MAGIC 164 | # MAGIC 1. **Import MlflowClient**: Imports the `MlflowClient` class from the `mlflow.tracking.client` module. 165 | # MAGIC 2. **Initialize MLflow Client**: Instantiates an `MlflowClient` object. 166 | # MAGIC 3. **Retrieve Latest Model Versions**: Uses the `get_latest_versions` method to fetch the latest versions of the model. Only considers versions in the "None" stage. 167 | # MAGIC 4. **Iterate and Store Model Version**: Iterates through the returned model versions and extracts their version numbers. 168 | # MAGIC 5. 
**Return Latest Version**: Returns the most recent version number of the model. 169 | # MAGIC 170 | # MAGIC #### Libraries Used 171 | # MAGIC 172 | # MAGIC - `mlflow.tracking.client`: Required for the `MlflowClient` class which is used to interact with the MLflow tracking server. 173 | # MAGIC 174 | # MAGIC --- 175 | # MAGIC 176 | 177 | # COMMAND ---------- 178 | 179 | # Import the MlflowClient class from the mlflow.tracking.client module 180 | from mlflow.tracking.client import MlflowClient 181 | 182 | # Define a function to get the latest version of a given model 183 | def get_latest_model_version(model_name: str): 184 | # Instantiate an MlflowClient object 185 | client = MlflowClient() 186 | 187 | # Retrieve the latest versions of the specified model 188 | models = client.get_latest_versions(model_name) 189 | 190 | # Iterate through the returned models 191 | new_model_version = None 192 | for m in models: 193 | # Extract and store the version number of the model 194 | new_model_version = m.version 195 | 196 | # Return the latest version number 197 | return new_model_version 198 | 199 | # COMMAND ---------- 200 | 201 | # MAGIC %md 202 | # MAGIC 203 | # MAGIC # Code Documentation for Model Endpoint Configuration 204 | # MAGIC 205 | # MAGIC This document provides an in-depth overview of the Python code that constructs a JSON configuration for creating or updating a model serving endpoint. 206 | # MAGIC 207 | # MAGIC --- 208 | # MAGIC 209 | # MAGIC ## Code Sections 210 | # MAGIC 211 | # MAGIC ### 1. Import Required Libraries 212 | # MAGIC 213 | # MAGIC #### Purpose 214 | # MAGIC 215 | # MAGIC - Import the Python `requests` library for HTTP requests. 216 | # MAGIC 217 | # MAGIC #### Code Explanation 218 | # MAGIC 219 | # MAGIC - `import requests` 220 | # MAGIC 221 | # MAGIC #### Libraries Used 222 | # MAGIC 223 | # MAGIC - `requests`: Python library for HTTP operations. 224 | # MAGIC 225 | # MAGIC --- 226 | # MAGIC 227 | # MAGIC ### 2. Define JSON Configuration for Model Endpoint 228 | # MAGIC 229 | # MAGIC #### Purpose 230 | # MAGIC 231 | # MAGIC - Creates a JSON object that holds the configuration for the model serving endpoint. 232 | # MAGIC 233 | # MAGIC #### Code Explanation 234 | # MAGIC 235 | # MAGIC 1. `"name": model_serving_endpoint_name`: Specifies the name of the model serving endpoint. 236 | # MAGIC 2. `"config": {...}`: Holds the configuration details for the model serving endpoint. 237 | # MAGIC 3. `"served_models": [...]`: A list of dictionaries, each representing a model to be served. 238 | # MAGIC - `"model_name": model_name`: The name of the model. 239 | # MAGIC - `"model_version": get_latest_model_version(model_name=model_name)`: Calls a function to get the latest version of the specified model. 240 | # MAGIC - `"workload_size": "Small"`: Sets the workload size to "Small". 241 | # MAGIC - `"scale_to_zero_enabled": True`: Enables the endpoint to scale to zero instances when not in use. 242 | # MAGIC 243 | # MAGIC #### Libraries Used 244 | # MAGIC 245 | # MAGIC - None. 246 | # MAGIC 247 | # MAGIC #### Dependencies 248 | # MAGIC 249 | # MAGIC - `model_serving_endpoint_name`: Variable holding the endpoint name. 250 | # MAGIC - `model_name`: Variable holding the model name. 251 | # MAGIC - `get_latest_model_version()`: Function that retrieves the latest model version. 
252 | # MAGIC 253 | # MAGIC #### JSON Structure 254 | # MAGIC 255 | # MAGIC ```json 256 | # MAGIC { 257 | # MAGIC "name": model_serving_endpoint_name, 258 | # MAGIC "config": { 259 | # MAGIC "served_models": [ 260 | # MAGIC { 261 | # MAGIC "model_name": model_name, 262 | # MAGIC "model_version": get_latest_model_version(model_name=model_name), 263 | # MAGIC "workload_size": "Small", 264 | # MAGIC "scale_to_zero_enabled": True 265 | # MAGIC } 266 | # MAGIC ] 267 | # MAGIC } 268 | # MAGIC } 269 | # MAGIC ``` 270 | # MAGIC 271 | 272 | # COMMAND ---------- 273 | 274 | import requests 275 | 276 | my_json = { 277 | "name": model_serving_endpoint_name, 278 | "config": { 279 | "served_models": [{ 280 | "model_name": model_name, 281 | "model_version": get_latest_model_version(model_name=model_name), 282 | "workload_size": "Small", 283 | "scale_to_zero_enabled": True 284 | }] 285 | } 286 | } 287 | 288 | # COMMAND ---------- 289 | 290 | my_json 291 | 292 | # COMMAND ---------- 293 | 294 | # MAGIC %md 295 | # MAGIC --- 296 | # MAGIC 297 | # MAGIC # Code Documentation for Model Serving Endpoint Functions 298 | # MAGIC 299 | # MAGIC This document provides an overview of two Python functions—`func_create_endpoint` and `func_delete_model_serving_endpoint`—used for managing model serving endpoints. 300 | # MAGIC 301 | # MAGIC ## Function Overview 302 | # MAGIC 303 | # MAGIC ### `func_create_endpoint` 304 | # MAGIC 305 | # MAGIC This function either creates a new model serving endpoint or updates an existing one based on the provided parameters. 306 | # MAGIC 307 | # MAGIC ### `func_delete_model_serving_endpoint` 308 | # MAGIC 309 | # MAGIC This function deletes an existing model serving endpoint based on its name. 310 | # MAGIC 311 | # MAGIC --- 312 | # MAGIC 313 | # MAGIC ## Detailed Function Descriptions 314 | # MAGIC 315 | # MAGIC ### Function: `func_create_endpoint` 316 | # MAGIC 317 | # MAGIC #### Purpose 318 | # MAGIC 319 | # MAGIC - Creates or updates the model serving endpoint. 320 | # MAGIC 321 | # MAGIC #### Parameters 322 | # MAGIC 323 | # MAGIC - `model_serving_endpoint_name`: Name of the model serving endpoint. 324 | # MAGIC - `instance`: API instance URL. 325 | # MAGIC - `headers`: HTTP headers for API requests. 326 | # MAGIC - `my_json`: JSON configuration for the model serving endpoint. 327 | # MAGIC 328 | # MAGIC #### Process 329 | # MAGIC 330 | # MAGIC 1. **Define Endpoint URL**: Composes the URL where the endpoint is or will be hosted. 331 | # MAGIC 2. **Check for Existing Endpoint**: Makes an HTTP GET request to check if the endpoint already exists. 332 | # MAGIC 3. **Create or Update Endpoint**: 333 | # MAGIC - If the endpoint does not exist, it creates a new one with the specified configuration. 334 | # MAGIC - If the endpoint does exist, it updates the configuration. 335 | # MAGIC 4. **Poll for Configuration Activation**: Waits until the new configuration is active. Stops waiting after a pre-defined time (10 minutes). 336 | # MAGIC 5. **Status Code Verification**: Checks that the API call was successful. 337 | # MAGIC 338 | # MAGIC #### Libraries Used 339 | # MAGIC 340 | # MAGIC - `requests`: For making HTTP calls. 341 | # MAGIC - `time`: For adding sleep functionality. 342 | # MAGIC - `json`: For JSON parsing. 343 | # MAGIC - `assertpy`: For assertions. 344 | # MAGIC 345 | # MAGIC ### Function: `func_delete_model_serving_endpoint` 346 | # MAGIC 347 | # MAGIC #### Purpose 348 | # MAGIC 349 | # MAGIC - Deletes an existing model serving endpoint. 
350 | # MAGIC 351 | # MAGIC #### Parameters 352 | # MAGIC 353 | # MAGIC - `model_serving_endpoint_name`: Name of the model serving endpoint. 354 | # MAGIC - `instance`: API instance URL. 355 | # MAGIC - `headers`: HTTP headers for API requests. 356 | # MAGIC 357 | # MAGIC #### Process 358 | # MAGIC 359 | # MAGIC 1. **Define Endpoint URL**: Composes the URL where the endpoint is hosted. 360 | # MAGIC 2. **Delete Endpoint**: Makes an HTTP DELETE request to remove the endpoint. 361 | # MAGIC 3. **Status Verification**: Checks if the deletion was successful and raises an exception if it fails. 362 | # MAGIC 363 | # MAGIC #### Libraries Used 364 | # MAGIC 365 | # MAGIC - `requests`: For making HTTP calls. 366 | # MAGIC 367 | # MAGIC --- 368 | # MAGIC 369 | # MAGIC This should give a detailed explanation of what each function is doing and how it accomplishes its goals. 370 | 371 | # COMMAND ---------- 372 | 373 | import requests 374 | import time 375 | import json 376 | import assertpy 377 | 378 | def func_create_endpoint(model_serving_endpoint_name, instance, headers, my_json): 379 | """ 380 | Create or update the model serving endpoint. 381 | """ 382 | 383 | # Define the endpoint URL 384 | endpoint_url = f"https://{instance}/api/2.0/serving-endpoints" 385 | url = f"{endpoint_url}/{model_serving_endpoint_name}" 386 | 387 | # Check if the endpoint already exists 388 | r = requests.get(url, headers=headers) 389 | if "RESOURCE_DOES_NOT_EXIST" in r.text: 390 | print(f"Creating new endpoint: ",f"https://{instance}/serving-endpoints/{model_serving_endpoint_name}/invocations") 391 | re = requests.post(headers=headers, url=endpoint_url, json=my_json) 392 | else: 393 | # Extract the new model version from the JSON configuration 394 | new_model_version = my_json['config']['served_models'][0]['model_version'] 395 | print(f"This endpoint existed previously! Updating it to new model version: {new_model_version}") 396 | 397 | # Update endpoint with new config 398 | url = f"{endpoint_url}/{model_serving_endpoint_name}/config" 399 | re = requests.put(url, headers=headers, json=my_json['config']) 400 | 401 | # Poll until the new configuration is active 402 | total_wait = 0 403 | while True: 404 | r = requests.get(url, headers=headers) 405 | assertpy.assert_that(r.status_code).is_equal_to(200) 406 | 407 | endpoint = json.loads(r.text) 408 | if "pending_config" in endpoint.keys(): 409 | seconds = 10 410 | print("New config still pending") 411 | if total_wait < 600: # 10 minutes 412 | print(f"Waiting for {seconds} seconds. Total wait time: {total_wait} seconds.") 413 | time.sleep(seconds) 414 | total_wait += seconds 415 | else: 416 | print(f"Stopping after {total_wait} seconds of waiting.") 417 | break 418 | else: 419 | print("New config in place now!") 420 | break 421 | # Check the response code 422 | assertpy.assert_that(re.status_code).is_equal_to(200) 423 | 424 | def func_delete_model_serving_endpoint(model_serving_endpoint_name, instance, headers): 425 | """ 426 | Delete the model serving endpoint. 
427 | """ 428 | 429 | # Define the endpoint URL 430 | endpoint_url = f"https://{instance}/ajax-api/2.0/serving-endpoints" 431 | url = f"{endpoint_url}/{model_serving_endpoint_name}" 432 | 433 | # Delete the endpoint 434 | response = requests.delete(url, headers=headers) 435 | 436 | if response.status_code != 200: 437 | raise Exception(f"Request failed with status {response.status_code}, {response.text}") 438 | else: 439 | print(f"{model_serving_endpoint_name} endpoint is deleted!") 440 | 441 | 442 | # COMMAND ---------- 443 | 444 | func_create_endpoint(model_serving_endpoint_name, instance, headers, my_json) 445 | -------------------------------------------------------------------------------- /Chapter-09/util/training.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | from delta.tables import DeltaTable 3 | import tempfile 4 | import os 5 | import logging 6 | import shutil 7 | from pathlib import Path 8 | 9 | import numpy as np 10 | import pandas as pd 11 | import pyspark 12 | import pyspark.sql.functions as F 13 | 14 | import math 15 | from sklearn.pipeline import Pipeline 16 | from sklearn.compose import ColumnTransformer 17 | from sklearn.impute import SimpleImputer 18 | from sklearn.preprocessing import OneHotEncoder 19 | from sklearn.compose import make_column_selector as selector 20 | from sklearn.model_selection import train_test_split 21 | from sklearn.ensemble import RandomForestRegressor 22 | from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score 23 | 24 | 25 | import mlflow 26 | from mlflow.tracking import MlflowClient 27 | from mlflow.exceptions import RestException 28 | from mlflow.models.signature import ModelSignature 29 | from mlflow.types.schema import Schema, ColSpec, DataType 30 | 31 | # COMMAND ---------- 32 | 33 | #mlflow util functions to manage models 34 | def transition_model(model_version, stage): 35 | """ 36 | Transition a model to a specified stage in MLflow Model Registry using the associated 37 | mlflow.entities.model_registry.ModelVersion object. 38 | 39 | Args: 40 | model_version: mlflow.entities.model_registry.ModelVersion. ModelVersion object to transition 41 | stage: (str) New desired stage for this model version. One of "Staging", "Production", "Archived" or "None" 42 | 43 | Returns: 44 | A single mlflow.entities.model_registry.ModelVersion object 45 | """ 46 | client = MlflowClient() 47 | 48 | # Check if the stage is valid 49 | if stage not in ["Staging", "Production", "Archived", "None"]: 50 | raise ValueError(f"Invalid stage: {stage}") 51 | 52 | # Transition the model version 53 | model_version = client.transition_model_version_stage( 54 | name=model_version.name, 55 | version=model_version.version, 56 | stage=stage, 57 | archive_existing_versions=True, 58 | ) 59 | 60 | return model_version 61 | 62 | 63 | def fetch_model_version(registry_model_name, stage="Staging"): 64 | """ 65 | For a given registered model, return the MLflow ModelVersion object 66 | This contains all metadata needed, such as params logged etc 67 | 68 | Args: 69 | registry_model_name: (str) Name of MLflow Registry Model 70 | stage: (str) Stage for this model. 
One of "Staging" or "Production" 71 | 72 | Returns: 73 | mlflow.entities.model_registry.ModelVersion 74 | """ 75 | client = MlflowClient() 76 | filter_string = f'name="{registry_model_name}"' 77 | registered_model = client.search_registered_models(filter_string=filter_string)[0] 78 | 79 | # Check if the stage is valid 80 | if stage not in ["Staging", "Production"]: 81 | raise ValueError(f"Invalid stage: {stage}") 82 | 83 | # Get the latest model version in the desired stage 84 | model_version = next( 85 | (model_version for model_version in registered_model.latest_versions if model_version.current_stage == stage), 86 | None 87 | ) 88 | 89 | return model_version 90 | 91 | 92 | def get_run_from_registered_model(registry_model_name, stage="Staging"): 93 | """ 94 | Get Mlflow run object from registered model 95 | 96 | Args: 97 | registry_model_name: (str) Name of MLflow Registry Model 98 | stage: (str) Stage for this model. One of "Staging" or "Production" 99 | 100 | Returns: 101 | mlflow.entities.run.Run 102 | """ 103 | client = MlflowClient() 104 | filter_string = f'name="{registry_model_name}"' 105 | registered_model = client.search_registered_models(filter_string=filter_string)[0] 106 | 107 | # Check if the stage is valid 108 | if stage not in ["Staging", "Production"]: 109 | raise ValueError(f"Invalid stage: {stage}") 110 | 111 | # Get the latest model version in the desired stage 112 | model_version = next( 113 | (model_version for model_version in registered_model.latest_versions if model_version.current_stage == stage), 114 | None 115 | ) 116 | 117 | if model_version is None: 118 | raise ValueError(f"No model version found in stage {stage} for model {registry_model_name}") 119 | 120 | run_id = model_version.run_id 121 | run = mlflow.get_run(run_id) 122 | 123 | return run 124 | 125 | 126 | def cleanup_registered_model(registry_model_name: str) -> None: 127 | """ 128 | Deletes a registered model in MLflow model registry. 129 | 130 | To delete a model in the model registry, all model versions must first be archived. 131 | This function first archives all versions of a model in the registry, and then deletes the model. 132 | 133 | Args: 134 | registry_model_name: The name of the model in the MLflow model registry. 
135 | """ 136 | client = MlflowClient() 137 | 138 | filter_string = f'name="{registry_model_name}"' 139 | 140 | model_versions = client.search_model_versions(filter_string=filter_string) 141 | 142 | if len(model_versions) == 0: 143 | logging.info("No registered models to delete") 144 | return 145 | 146 | logging.info(f"Deleting following registered model: {registry_model_name}") 147 | 148 | # Move any versions of the model to Archived 149 | for model_version in model_versions: 150 | try: 151 | if model_version.current_stage!='Archived': 152 | client.transition_model_version_stage( 153 | name=model_version.name, 154 | version=model_version.version, 155 | stage="Archived", 156 | ) 157 | except Exception as e: 158 | logging.exception(f"Error archiving version {model_version.version} of model {registry_model_name}") 159 | raise 160 | 161 | try: 162 | client.delete_registered_model(registry_model_name) 163 | except RestException as e: 164 | logging.exception(f"Error deleting registered model {registry_model_name}") 165 | raise 166 | 167 | 168 | 169 | # COMMAND ---------- 170 | 171 | #delete any registered model from registry 172 | cleanup_registered_model(mlflow_experiment_name) 173 | 174 | # COMMAND ---------- 175 | 176 | 177 | #delta table utility functions 178 | def get_delta_version(delta_path: str) -> int: 179 | """ 180 | Gets the latest version of a Delta table given the path to the table. 181 | 182 | Args: 183 | delta_path: The path to the Delta table 184 | 185 | Returns: 186 | The version of the Delta table. 187 | """ 188 | try: 189 | delta_table = DeltaTable.forPath(spark, delta_path) 190 | delta_history= delta_table.history() 191 | 192 | # Retrieve the lastest Delta version - this is the version loaded when reading from delta_path 193 | delta_version = delta_history.first()["version"] 194 | 195 | return delta_version 196 | 197 | except AnalysisException as e: 198 | raise ValueError(f"Error getting Delta table version: {e}") 199 | 200 | def load_delta_table_from_run(run: mlflow.entities.run.Run) -> pyspark.sql.DataFrame: 201 | """ 202 | Given an MLflow run, load the Delta table which was used for that run, 203 | using the path and version tracked at tracking time. 204 | 205 | Note that by default Delta tables only retain a commit history for 30 days, meaning 206 | that previous versions older than 30 days will be deleted by default. This property can 207 | be updated using the Delta table property `delta.logRetentionDuration`. 208 | 209 | For more information, see https://docs.databricks.com/delta/delta-batch.html#data-retention 210 | 211 | Args: 212 | run: The MLflow run object. 213 | 214 | Returns: 215 | The Spark DataFrame for the Delta table used in the run. 216 | """ 217 | delta_path = run.data.params.get("delta_path") 218 | delta_version = run.data.params.get("delta_version") 219 | if not delta_path or not delta_version: 220 | raise ValueError("Error: missing delta_path or delta_version parameters.") 221 | print(f"Loading Delta table from path: {delta_path}; version: {delta_version}") 222 | try: 223 | df = spark.read.format("delta").option("versionAsOf", delta_version).load(delta_path) 224 | return df 225 | except Exception as e: 226 | print(f"Error: could not load Delta table. {str(e)}") 227 | raise 228 | 229 | # COMMAND ---------- 230 | 231 | def calculate_summary_stats(pdf: pd.DataFrame) -> pd.DataFrame: 232 | """ 233 | Create a pandas DataFrame of summary statistics for a provided pandas DataFrame. 
234 | Involved calling .describe on pandas DataFrame provided and additionally add 235 | median values and a count of null values for each column. 236 | 237 | :param pdf: pandas DataFrame 238 | :return: pandas DataFrame of sumary statistics for each column 239 | """ 240 | stats_pdf = pdf.describe(include="all") 241 | 242 | # Add median values row 243 | median_vals = pdf.median() 244 | stats_pdf.loc["median"] = median_vals 245 | 246 | # Add null values row 247 | null_count = pdf.isna().sum() 248 | stats_pdf.loc["null_count"] = null_count 249 | 250 | return stats_pdf 251 | 252 | 253 | def log_summary_stats_pdf_as_csv(pdf: pd.DataFrame) -> None: 254 | """ 255 | Log summary statistics pandas DataFrame as a csv file to MLflow as an artifact. 256 | 257 | Args: 258 | pdf: A pandas DataFrame containing summary statistics. 259 | """ 260 | with tempfile.NamedTemporaryFile(prefix="summary_stats", suffix=".csv", delete=False) as temp: 261 | pdf.to_csv(temp.name, index=True) 262 | artifact_name = "summary_stats.csv" 263 | shutil.move(temp.name, artifact_name) 264 | mlflow.log_artifact(artifact_name, artifact_path="summary_stats") 265 | os.remove(artifact_name) 266 | 267 | 268 | def load_summary_stats_pdf_from_run(run: mlflow.entities.run.Run, local_tmp_dir: str) -> pd.DataFrame: 269 | """ 270 | Given an MLflow run, download the summary stats csv artifact to a local_tmp_dir and load the 271 | csv into a pandas DataFrame. 272 | 273 | Args: 274 | run: The MLflow run object. 275 | local_tmp_dir: (str) path to a local filesystem tmp directory 276 | 277 | Returns: 278 | A pandas DataFrame containing statistics computed during training. 279 | """ 280 | 281 | # Use mlflow to download the csv file logged in the artifacts of a run to a local tmp path 282 | Path(local_tmp_dir).mkdir(parents=True, exist_ok=True) 283 | local_path=mlflow.artifacts.download_artifacts(run_id=run.info.run_id, artifact_path="summary_stats", dst_path=local_tmp_dir) 284 | print(f"Summary stats artifact downloaded in: {local_path}") 285 | 286 | # Load the csv into a pandas DataFrame 287 | summary_stats_path = os.path.join(local_path, os.listdir(local_path)[0]) 288 | try: 289 | summary_stats_pdf = pd.read_csv(summary_stats_path, index_col="Unnamed: 0") 290 | except Exception as e: 291 | raise ValueError(f"Failed to load summary stats csv from path {summary_stats_path}: {e}") 292 | 293 | return summary_stats_pdf 294 | 295 | # COMMAND ---------- 296 | 297 | def create_sklearn_rf_pipeline(model_params, seed=42): 298 | """ 299 | Create the sklearn pipeline required for the RandomForestRegressor. 300 | We compose two components of the pipeline separately - one for numeric cols, one for categorical cols 301 | These are then combined with the final RandomForestRegressor stage, which uses the model_params dict 302 | provided via the args. The unfitted pipeline is returned. 303 | 304 | For a robust pipeline in practice, one should also have a pipeline stage to add indicator columns for those features 305 | which have been imputed. This can be useful to encode information about those instances which have been imputed with 306 | a given value. We refrain from doing so here to simplify the pipeline, and focus on the overall workflow. 
307 | 308 | Args: 309 | model_params: (dict) Dictionary of model parameters to pass into sklearn RandomForestRegressor 310 | seed : (int) Random seed to set via random_state arg in RandomForestRegressor 311 | 312 | Returns: 313 | sklearn pipeline 314 | """ 315 | # Create pipeline component for numeric Features 316 | numeric_transformer = Pipeline(steps=[ 317 | ("imputer", SimpleImputer(strategy='median'))]) 318 | 319 | # Create pipeline component for categorical Features 320 | categorical_transformer = Pipeline(steps=[ 321 | ("imputer", SimpleImputer(strategy="most_frequent")), 322 | ("ohe", OneHotEncoder(handle_unknown="ignore"))]) 323 | 324 | # Combine numeric and categorical components into one preprocessor pipeline 325 | # Use ColumnTransformer to apply the different preprocessing pipelines to different subsets of features 326 | # Use selector (make_column_selector) to select which subset of features to apply pipeline to 327 | preprocessor = ColumnTransformer(transformers=[ 328 | ("numeric", numeric_transformer, selector(dtype_exclude="category")), 329 | ("categorical", categorical_transformer, selector(dtype_include="category")) 330 | ]) 331 | 332 | pipeline = Pipeline(steps=[ 333 | ("preprocessor", preprocessor), 334 | ("rf", RandomForestRegressor(random_state=seed, **model_params)) 335 | ]) 336 | 337 | return pipeline 338 | 339 | def train_sklearn_rf_model(run_name, delta_path, model_params, misc_params, seed=42): 340 | """ 341 | Function to trigger training and evaluation of an sklearn RandomForestRegressor model. 342 | 343 | Parameters, metrics, and artifacts are logged to MLflow during this process. 344 | 345 | Returns the MLflow run object. 346 | 347 | Args: 348 | run_name: (str) Name to give to MLflow run. 349 | delta_path: (str) Path to Delta table to use as input data. 350 | model_params: (dict) Dictionary of model parameters to pass into sklearn RandomForestRegressor. 351 | misc_params: (dict) Dictionary of parameters to use. 352 | seed: (int) Random seed. 353 | 354 | Returns: 355 | mlflow.entities.run.Run 356 | """ 357 | 358 | #end any active run 359 | mlflow.end_run() 360 | 361 | # Enable MLflow autologging. 362 | mlflow.autolog(log_input_examples=True, silent=True) 363 | 364 | # Load Delta table from `delta_path`. 365 | df = spark.read.format("delta").load(delta_path) 366 | 367 | # Log `delta_path` and version. 368 | mlflow.log_param("delta_path", delta_path) 369 | delta_version = get_delta_version(delta_path) 370 | mlflow.log_param("delta_version", delta_version) 371 | 372 | # Track misc parameters used in pipeline creation (preprocessing) as JSON artifact. 373 | mlflow.log_dict(misc_params, "preprocessing_params.json") 374 | 375 | # Convert Spark DataFrame to pandas, as we will be training an sklearn model. 376 | pdf = df.toPandas() 377 | 378 | # Convert all categorical columns to category dtype. 379 | for c in misc_params["cat_cols"]: 380 | pdf[c] = pdf[c].astype("category") 381 | 382 | #keek only the required columns 383 | cols_to_keep = np.concatenate(([misc_params['target_col']], misc_params['cat_cols'], misc_params['num_cols']), axis=None) 384 | pdf = pdf[cols_to_keep] 385 | 386 | 387 | # Create summary statistics pandas DataFrame and log as a CSV to MLflow. 388 | summary_stats_pdf = calculate_summary_stats(pdf[cols_to_keep]) 389 | log_summary_stats_pdf_as_csv(summary_stats_pdf) 390 | 391 | # Track number of total instances and "month". 392 | num_instances = pdf.shape[0] 393 | mlflow.log_param("num_instances", num_instances) # Log number of instances. 
394 | mlflow.log_param("month", misc_params["month"]) # Log month number. 395 | 396 | # Split data. 397 | X = pdf.drop([misc_params["target_col"]], axis=1) 398 | y = pdf[misc_params["target_col"]] 399 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=seed) 400 | 401 | # Track train/test data info as parameters. 402 | num_training = X_train.shape[0] 403 | mlflow.log_param("num_training_instances", num_training) 404 | num_test = X_test.shape[0] 405 | mlflow.log_param("num_test_instances", num_test) 406 | 407 | # Fit sklearn pipeline with RandomForestRegressor model. 408 | rf_pipeline = create_sklearn_rf_pipeline(model_params) 409 | rf_pipeline.fit(X_train, y_train) 410 | 411 | # Make predictions on the test data 412 | y_pred = rf_pipeline.predict(X_test) 413 | 414 | # Calculate evaluation metrics on the test data 415 | mae = mean_absolute_error(y_test, y_pred) 416 | mse = mean_squared_error(y_test, y_pred) 417 | r2 = r2_score(y_test, y_pred) 418 | rmse = math.sqrt(mse) 419 | 420 | # Specify data schema which the model will use as its ModelSignature. 421 | 422 | input_schema = Schema([ 423 | ColSpec(name="Weather_Condition", type=DataType.string), 424 | ColSpec(name="Promotion_Type", type=DataType.string), 425 | ColSpec(name="Device_Type", type=DataType.string), 426 | ColSpec(name="Temperature", type=DataType.float), 427 | ColSpec(name="Website_Traffic", type=DataType.integer) 428 | ]) 429 | 430 | output_schema = Schema([ColSpec("integer")]) 431 | signature = ModelSignature(input_schema, output_schema) 432 | mlflow.sklearn.log_model(rf_pipeline, "model", signature=signature) 433 | 434 | return mlflow.active_run() 435 | 436 | 437 | 438 | -------------------------------------------------------------------------------- /Chapter-09/util/monitoring.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | # MAGIC %md 3 | # MAGIC 4 | # MAGIC #### Monitoring Utility Functions 5 | # MAGIC 6 | # MAGIC The following functions check 7 | # MAGIC - the proportion of nulls 8 | # MAGIC - the differences in summary statistics 9 | # MAGIC - the shifts in distributions 10 | 11 | # COMMAND ---------- 12 | 13 | 14 | from scipy import stats 15 | import matplotlib.pyplot as plt 16 | import seaborn as sns 17 | 18 | # COMMAND ---------- 19 | 20 | def check_null_proportion(new_pdf, null_proportion_threshold): 21 | """ 22 | Function to compute the proportions of nulls for all columns in a Spark DataFrame and return any features that exceed the specified null threshold. 23 | 24 | Args: 25 | df: (pd.DataFrame) The DataFrame that contains new incoming data. 26 | null_proportion_threshold: (float) A numeric value ranging from 0 and 1 that specifies the tolerable fraction of nulls. 27 | 28 | Returns: 29 | A dictionary mapping feature names to their null proportions. 30 | 31 | Raises: 32 | ValueError: If the null proportion threshold is not between 0 and 1. 33 | 34 | Notes: 35 | * This function uses the `isnull()` method to identify null values in the DataFrame. 36 | * The `sum()` method is used to count the number of null values in each column. 37 | * The `len()` method is used to get the total number of rows in the DataFrame. 38 | * The `transpose()` method is used to convert the DataFrame from a long format to a wide format. 39 | * The `assert` statement is used to check that the null proportion threshold is between 0 and 1. 
40 | * The `print()` statement is used to print an alert if there are any features that exceed the null proportion threshold. 41 | """ 42 | 43 | # Check that the null proportion threshold is between 0 and 1. 44 | if null_proportion_threshold < 0 or null_proportion_threshold > 1: 45 | raise ValueError( 46 | "The null proportion threshold must be between 0 and 1. " 47 | f"Received: {null_proportion_threshold}" 48 | ) 49 | 50 | # Compute the proportions of nulls for all columns in the DataFrame. 51 | missing_stats = pd.DataFrame(new_pdf.isnull().sum() / len(new_pdf)).transpose() 52 | 53 | # Get a list of the column names that exceed the null proportion threshold. 54 | null_col_list = missing_stats.columns[(missing_stats >= null_proportion_threshold).iloc[0]] 55 | 56 | # Create a dictionary mapping feature names to their null proportions. 57 | null_dict = {} 58 | for feature in null_col_list: 59 | null_dict[feature] = missing_stats[feature][0] 60 | 61 | # Check if any features exceed the null proportion threshold. 62 | if len(null_dict) > 0: 63 | print("Alert: There are feature(s) that exceed(s) the expected null threshold. Please ensure that the data is ingested correctly") 64 | print(null_dict) 65 | 66 | # Return the dictionary of null proportions. 67 | return null_dict 68 | 69 | 70 | # COMMAND ---------- 71 | 72 | def check_diff_in_summary_stats(new_stats_pdf, prod_stats_pdf, num_cols, stats_threshold_limit, statistic_list): 73 | """ 74 | Function to check if the new summary stats significantly deviates from the summary stats in the production data by a certain threshold. 75 | 76 | Args: 77 | new_stats_pdf: (pd.DataFrame) summary statistics of incoming data 78 | prod_stats_pdf: (pd.DataFrame) summary statistics of production data 79 | num_cols: (list) a list of numeric columns 80 | stats_threshold_limit: (float) a float < 1 that signifies the threshold limit 81 | statistic_list: (list) a list of statistics, e.g., mean, std, min, max 82 | 83 | Returns: 84 | A list of feature names that significantly deviate from the production data. 85 | 86 | Raises: 87 | ValueError: If the stats_threshold_limit is not between 0 and 1. 88 | 89 | Notes: 90 | * This function uses the `loc` method to get the value of a specific statistic for a given feature. 91 | * The `round` method is used to round a number to a specified number of decimal places. 92 | * The `print` statement is used to print the results of the function. 93 | """ 94 | 95 | # Check that the stats_threshold_limit is between 0 and 1. 96 | if stats_threshold_limit < 0 or stats_threshold_limit > 1: 97 | raise ValueError( 98 | "The stats_threshold_limit must be between 0 and 1. " 99 | f"Received: {stats_threshold_limit}" 100 | ) 101 | 102 | # Create a list of feature names that significantly deviate from the production data. 103 | feature_diff_list = [] 104 | 105 | # Iterate over the numeric columns. 106 | for feature in num_cols: 107 | 108 | # Print a message indicating that the feature is being checked. 109 | print(f"\nCHECKING {feature}.........") 110 | 111 | # Iterate over the statistics. 112 | for statistic in statistic_list: 113 | 114 | # Get the value of the statistic for the feature in the production data. 115 | prod_stat_value = prod_stats_pdf[[str(feature)]].loc[str(statistic)][0] 116 | 117 | # Calculate the upper and lower threshold limits for the statistic. 
118 | upper_val_limit = prod_stat_value * (1 + stats_threshold_limit) 119 | lower_val_limit = prod_stat_value * (1 - stats_threshold_limit) 120 | 121 | # Get the value of the statistic for the feature in the new data. 122 | new_stat_value = new_stats_pdf[[str(feature)]].loc[str(statistic)][0] 123 | 124 | # Check if the new statistic value is outside of the threshold limits. 125 | if new_stat_value < lower_val_limit: 126 | feature_diff_list.append(str(feature)) 127 | print(f"\tThe {statistic} {feature} in the new data is at least {stats_threshold_limit * 100}% lower than the {statistic} in the production data. Decreased from {round(prod_stat_value, 2)} to {round(new_stat_value, 2)}.") 128 | 129 | elif new_stat_value > upper_val_limit: 130 | feature_diff_list.append(str(feature)) 131 | print(f"\tThe {statistic} {feature} in the new data is at least {stats_threshold_limit * 100}% higher than the {statistic} in the production data. Increased from {round(prod_stat_value, 2)} to {round(new_stat_value, 2)}.") 132 | 133 | # Return the list of feature names that significantly deviate from the production data. 134 | return np.unique(feature_diff_list) 135 | 136 | 137 | # COMMAND ---------- 138 | 139 | def check_diff_in_variances(reference_df, new_df, num_cols, p_threshold): 140 | """ 141 | Function to check if the variances of the numeric columns in `new_df` are significantly different from the variances of the corresponding columns in `reference_df`. 142 | 143 | Args: 144 | reference_df: (pd.DataFrame) The DataFrame that contains the production data. 145 | new_df: (pd.DataFrame) The DataFrame that contains the new data. 146 | num_cols: (list) A list of the names of the numeric columns. 147 | p_threshold: (float) The p-value threshold for significance. 148 | 149 | Returns: 150 | A dictionary mapping feature names to their p-values. 151 | 152 | Raises: 153 | ValueError: If `p_threshold` is not between 0 and 1. 154 | 155 | Notes: 156 | * This function uses the `levene()` function from the `scipy.stats` module to perform the Levene test. 157 | * The `assert` statement is used to check that `p_threshold` is between 0 and 1. 158 | * The `print()` statements are used to print the results of the function. 159 | """ 160 | 161 | # Check that `p_threshold` is between 0 and 1. 162 | if p_threshold < 0 or p_threshold > 1: 163 | raise ValueError( 164 | "The p_threshold must be between 0 and 1. " 165 | f"Received: {p_threshold}" 166 | ) 167 | 168 | # Create a dictionary mapping feature names to their p-values. 169 | var_dict = {} 170 | 171 | # Iterate over the numeric columns. 172 | for feature in num_cols: 173 | 174 | # Perform the Levene test. 175 | levene_stat, levene_pval = stats.levene(reference_df[feature], new_df[feature], center="median") 176 | 177 | # If the p-value is less than or equal to the threshold, then the variances are significantly different. 178 | if levene_pval <= p_threshold: 179 | var_dict[feature] = levene_pval 180 | 181 | # Check if any features have significantly different variances. 182 | if len(var_dict) > 0: 183 | print(f"The feature(s) below have significantly different variances compared to production data at p-value {p_threshold}") 184 | print(var_dict) 185 | else: 186 | print(f"No features have significantly different variances compared to production data at p-value {p_threshold}") 187 | 188 | # Return the dictionary of p-values. 
189 | return var_dict 190 | 191 | 192 | # COMMAND ---------- 193 | 194 | def check_dist_ks_bonferroni_test(reference_df, new_df, num_cols, p_threshold, ks_alternative="two-sided"): 195 | """ 196 | Function to take two pandas DataFrames and compute the Kolmogorov-Smirnov statistic on 2 sample distributions 197 | where the variable in question is continuous. 198 | This is a two-sided test for the null hypothesis that 2 independent samples are drawn from the same continuous 199 | distribution. If the KS statistic is small or the p-value is high, then we cannot reject the hypothesis that 200 | the distributions of the two samples are the same. 201 | The alternative hypothesis can be either ‘two-sided’ (default), ‘less’ or ‘greater’. 202 | This function assumes that the distributions to compare have the same column name in both DataFrames. 203 | 204 | Args: 205 | reference_df: pandas DataFrame containing column with the distribution to be compared 206 | new_df: pandas DataFrame containing column with the distribution to be compared 207 | num_cols: (list) A list of the names of the numeric columns. 208 | p_threshold: (float) The p-value threshold for significance. 209 | ks_alternative: Defines the alternative hypothesis - ‘two-sided’ (default), ‘less’ or ‘greater’. 210 | 211 | Returns: 212 | A dictionary mapping feature names to their p-values. 213 | 214 | Raises: 215 | ValueError: If `p_threshold` is not between 0 and 1. 216 | 217 | Notes: 218 | * This function uses the `ks_2samp()` function from the `scipy.stats` module to perform the Kolmogorov-Smirnov test. 219 | * The `assert` statement is used to check that `p_threshold` is between 0 and 1. 220 | * The `print()` statements are used to print the results of the function. 221 | * The Bonferroni correction is used to adjust the p-value threshold to account for multiple comparisons. 222 | """ 223 | 224 | # Check that `p_threshold` is between 0 and 1. 225 | if p_threshold < 0 or p_threshold > 1: 226 | raise ValueError( 227 | "The p_threshold must be between 0 and 1. " 228 | f"Received: {p_threshold}" 229 | ) 230 | 231 | # Compute the Bonferroni-corrected alpha level. 232 | corrected_alpha = p_threshold / len(num_cols) 233 | 234 | # Create a dictionary mapping feature names to their p-values. 235 | ks_dict = {} 236 | 237 | # Iterate over the numeric columns. 238 | for feature in num_cols: 239 | 240 | # Compute the Kolmogorov-Smirnov statistic and p-value. 241 | ks_stat, ks_pval = stats.ks_2samp(reference_df[feature], new_df[feature], alternative=ks_alternative, mode="asymp") 242 | 243 | # If the p-value is less than or equal to the corrected alpha level, then the distributions are significantly different. 244 | if ks_pval <= corrected_alpha: 245 | ks_dict[feature] = ks_pval 246 | 247 | # Check if any features have significantly different distributions. 248 | if len(ks_dict) > 0: 249 | print(f"The feature(s) below have significantly different distributions compared to production data at Bonferroni-corrected alpha level of {round(corrected_alpha, 4)}, according to the KS test") 250 | print("\t", ks_dict) 251 | else: 252 | print(f"No feature distributions has shifted according to the KS test at the Bonferroni-corrected alpha level of {round(corrected_alpha, 4)}. ") 253 | 254 | # Return the dictionary of p-values. 
255 |     return ks_dict
256 | 
257 | # COMMAND ----------
258 | 
259 | def check_categorical_diffs(reference_pdf, new_pdf, cat_cols, p_threshold):
260 |     """
261 |     This function checks if there are differences in expected counts for categorical variables between the incoming data and the data in production.
262 | 
263 |     Args:
264 |         reference_pdf: (pandas DataFrame) data in production (the reference data)
265 |         new_pdf: (pandas DataFrame) new incoming data
266 |         cat_cols: (list) a list of categorical columns
267 |         p_threshold: (float) The p-value threshold for significance.
268 |     Returns:
269 |         A dictionary mapping feature names to their p-values.
270 | 
271 |     Raises:
272 |         ValueError: If `p_threshold` is not between 0 and 1.
273 | 
274 |     Notes:
275 |     * This function uses the `chi2_contingency()` function from the `scipy.stats` module to perform the chi-squared test of independence.
276 |     * A `ValueError` is raised if `p_threshold` is not between 0 and 1.
277 |     * The `print()` statements are used to print the results of the function.
278 |     """
279 | 
280 |     # Check that `p_threshold` is between 0 and 1.
281 |     if p_threshold < 0 or p_threshold > 1:
282 |         raise ValueError(
283 |             "The p_threshold must be between 0 and 1. "
284 |             f"Received: {p_threshold}"
285 |         )
286 | 
287 |     # Create a dictionary mapping feature names to their p-values.
288 |     chi_dict = {}
289 | 
290 |     # Iterate over the categorical columns.
291 |     for feature in cat_cols:
292 | 
293 |         # Calculate the observed frequencies by creating a contingency table using pd.crosstab
294 |         observed_freq = pd.crosstab(reference_pdf[feature], new_pdf[feature])
295 | 
296 |         # Perform the Chi-Square test of independence
297 |         chi2, p_value, _, _ = stats.chi2_contingency(observed_freq)
298 | 
299 |         # If the p-value is less than or equal to the threshold, then the expected counts are significantly different.
300 |         if p_value <= p_threshold:
301 |             chi_dict[feature] = p_value
302 | 
303 |     # Check if any features have significantly different expected counts.
304 |     if len(chi_dict) > 0:
305 |         print(f"The following categorical variables have significantly different expected counts compared to the production data at p-value {p_threshold}:")
306 |         print("\t", chi_dict)
307 |     else:
308 |         print(f"No categorical variables have significantly different expected counts compared to the production data at p-value {p_threshold}.")
309 | 
310 |     return chi_dict
311 | 
312 | # COMMAND ----------
313 | 
314 | def compare_model_perfs(current_staging_run, current_prod_run, min_model_perf_threshold, metric_to_check):
315 |     """
316 |     This function compares the performances of the models in staging and in production.
317 | 
318 |     Args:
319 |         current_staging_run: MLflow run that contains information on the staging model
320 |         current_prod_run: MLflow run that contains information on the production model
321 |         min_model_perf_threshold (float): The minimum threshold that the staging model should exceed before being transitioned to production
322 |         metric_to_check (string): The metric that the user is interested in using to compare model performances
323 | 
324 |     Returns:
325 |         None. Prints a recommendation on whether or not to transition the staging model to production.
326 | 
327 |     Raises:
328 |         ValueError: If `min_model_perf_threshold` is not positive.
329 | 
330 |     Notes:
331 |     * This function uses the `data.metrics` attribute of the MLflow runs to get the metrics for the staging and production models.
332 |     * The `round()` function is used to round the difference in performance to two decimal places.
333 |     * The `print()` statements are used to print the results of the function.
334 | """ 335 | 336 | # Check that `min_model_perf_threshold` is positive. 337 | if min_model_perf_threshold < 0: 338 | raise ValueError( 339 | "The min_model_perf_threshold must be positive. " 340 | f"Received: {min_model_perf_threshold}" 341 | ) 342 | 343 | # Calculate the difference in performance between the staging and production models. 344 | model_diff_fraction = current_staging_run.data.metrics[str(metric_to_check)] / current_prod_run.data.metrics[str(metric_to_check)] 345 | model_diff_percent = round((model_diff_fraction - 1)*100, 2) 346 | 347 | # Print the performance of the staging and production models. 348 | print(f"Staging run's {metric_to_check}: {round(current_staging_run.data.metrics[str(metric_to_check)],3)}") 349 | print(f"Current production run's {metric_to_check}: {round(current_prod_run.data.metrics[str(metric_to_check)],3)}") 350 | 351 | # Recommend whether to transition the staging model to production. 352 | if model_diff_percent >= 0 and (model_diff_fraction - 1 >= min_model_perf_threshold): 353 | print(f"The current staging run exceeds the model improvement threshold of at least +{min_model_perf_threshold}. You may proceed with transitioning the staging model to production now.") 354 | 355 | elif model_diff_percent >= 0 and (model_diff_fraction - 1 < min_model_perf_threshold): 356 | print(f"CAUTION: The current staging run does not meet the improvement threshold of at least +{min_model_perf_threshold}. Transition the staging model to production with caution.") 357 | else: 358 | print(f"ALERT: The current staging run underperforms by {model_diff_percent}% when compared to the production model. Do not transition the staging model to production.") 359 | 360 | 361 | # COMMAND ---------- 362 | 363 | def plot_boxplots(unique_feature_diff_array, reference_pdf, new_pdf): 364 | """ 365 | Plot boxplots comparing the distributions of unique features between incoming data and production data. 366 | 367 | Args: 368 | unique_feature_diff_array (list): List of unique feature names to compare. 369 | reference_pdf (pandas.DataFrame): Reference production data. 370 | new_pdf (pandas.DataFrame): New incoming data. 371 | 372 | Returns: 373 | matplotlib.figure.Figure: The generated figure. 374 | 375 | Raises: 376 | None 377 | 378 | """ 379 | # Set the theme of the plots. 380 | sns.set_theme(style="whitegrid") 381 | 382 | # Calculate the number of columns. 383 | num_columns = len(unique_feature_diff_array) 384 | 385 | # Create a figure and axes. 386 | fig, axes = plt.subplots(1, num_columns, figsize=(5*num_columns, 5)) 387 | 388 | # Set the title of the figure. 389 | fig.suptitle("Distribution Comparisons between Incoming Data and Production Data") 390 | 391 | # Plot boxplots for each column name side by side. 392 | for i, column_name in enumerate(unique_feature_diff_array): 393 | ax = axes[i] if num_columns > 1 else axes # Access the correct subplot. 394 | ax.boxplot([reference_pdf[column_name], new_pdf[column_name]]) 395 | ax.set_xticklabels(['Production Data', 'New Incoming Data']) 396 | ax.set_title(column_name) 397 | 398 | # Set common y-axis label. 399 | fig.text(0.04, 0.5, 'Value', va='center', rotation='vertical') 400 | 401 | # Set plot title. 402 | plt.suptitle('Boxplot Comparison') 403 | 404 | plt.close() 405 | 406 | # Return the generated figure. 
407 | return fig 408 | 409 | -------------------------------------------------------------------------------- /Chapter-06/model-registry-and-webhooks.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | # MAGIC 3 | # MAGIC %md 4 | # MAGIC ## Author 5 | # MAGIC 6 | # MAGIC - **Debu Sinha** 7 | # MAGIC 8 | # MAGIC ## Tested Environment 9 | # MAGIC 10 | # MAGIC - **Databricks Runtime**: This notebook is tested on Databricks Runtime for Machine Learning 13.3 LTS or above. 11 | # MAGIC - **Cluster Configuration**: Single node cluster with at least 32GB RAM and 4 VCPU. 12 | # MAGIC - **Note**: The same cluster set up in Chapters 3 and 4 will be used here. 13 | # MAGIC 14 | # MAGIC ## Cluster Setup Instructions 15 | # MAGIC 16 | # MAGIC 1. **Create a Cluster**: 17 | # MAGIC - Navigate to the `Compute` icon on the left sidebar and click on `Create Cluster`. 18 | # MAGIC - Under `Policy`, select `Unrestricted`. 19 | # MAGIC - Enter a name for your cluster, for example, `demo`, into the cluster name text box. 20 | # MAGIC - In `Cluster Mode`, select `Single Node`. 21 | # MAGIC - Choose `Databricks Runtime Version` 13.3 LTS (Scala 2.12, Spark 3.4.1) from the `ML` tab. 22 | # MAGIC - On `AWS`, select `i3.xlarge` / on `Azure`, select `Standard_DS4_V2` as __Node type__. 23 | # MAGIC - Click on `Create Cluster` and wait for your cluster to be provisioned. 24 | # MAGIC 25 | # MAGIC 2. **Attach this Notebook to Your Cluster**: 26 | # MAGIC - Click on the menu labeled `Detached` at the top left of this workbook. 27 | # MAGIC - Select your cluster name to attach this notebook to your cluster. 28 | # MAGIC 29 | # MAGIC ## MLflow Model Registry API 30 | # MAGIC 31 | # MAGIC This section demonstrates how to register a model in the registry and request its transition to the staging environment. 32 | 33 | # COMMAND ---------- 34 | 35 | # MAGIC %md 36 | # MAGIC ### Retrieving the Most Recently Updated Experiment from the MLflow Server 37 | # MAGIC 38 | # MAGIC In this code snippet, several key tasks are carried out: 39 | # MAGIC 40 | # MAGIC 1. **Initialize MLflow Client**: 41 | # MAGIC - The MLflow tracking client is initialized to interact with the MLflow server. 42 | # MAGIC 43 | # MAGIC 2. **Fetch Available Experiments**: 44 | # MAGIC - A list of all available experiments is fetched using the `search_experiments()` method of the client. 45 | # MAGIC 46 | # MAGIC 3. **Sort Experiments by Last Update Time**: 47 | # MAGIC - The fetched experiments are sorted based on their last update time in descending order, ensuring that the most recently modified experiment comes first. 48 | # MAGIC 49 | # MAGIC 4. **Retrieve Latest Experiment**: 50 | # MAGIC - The most recently updated experiment is then extracted from the sorted list and stored in the `latest_experiment` variable. 51 | # MAGIC 52 | # MAGIC 5. **Display Experiment Name**: 53 | # MAGIC - The name of the most recently updated experiment is printed out for confirmation. 54 | # MAGIC 55 | # MAGIC > **Note**: If you are specifically interested in the experiment related to AutoML for base model creation, make sure that the `latest_experiment` actually corresponds to that particular experiment. 
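# A hedged aside: if you already know which experiment you need (for example, the AutoML
# experiment created in an earlier chapter), it is safer to pin it by name than to rely on the
# "most recently updated" lookup in the next cell. The experiment path below is a placeholder,
# not a path defined elsewhere in this repository.
#
#   pinned_experiment = mlflow.get_experiment_by_name("/Users/<your-user>/<automl-experiment-name>")
#   if pinned_experiment is not None:
#       latest_experiment = pinned_experiment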
56 | # MAGIC 57 | 58 | # COMMAND ---------- 59 | 60 | import mlflow 61 | 62 | # Initialize the MLflow client 63 | client = mlflow.tracking.MlflowClient() 64 | 65 | # Fetch all available experiments 66 | experiments = client.search_experiments() 67 | 68 | # Sort the experiments by their last update time in descending order 69 | sorted_experiments = sorted(experiments, key=lambda x: x.last_update_time, reverse=True) 70 | 71 | # Retrieve the most recently updated experiment 72 | latest_experiment = sorted_experiments[0] 73 | 74 | # Output the name of the latest experiment 75 | print(f"The most recently updated experiment is named '{latest_experiment.name}'.") 76 | 77 | # Note: If you're specifically looking for the experiment related to AutoML for base model creation, 78 | # ensure that 'latest_experiment' corresponds to that experiment. 79 | 80 | # COMMAND ---------- 81 | 82 | # MAGIC %md 83 | # MAGIC ### Identifying the Best Model Run ID from a Specific Experiment in MLflow 84 | # MAGIC 85 | # MAGIC In this code snippet, the objective is multi-fold: 86 | # MAGIC 87 | # MAGIC 1. **Fetch Current User's Username**: 88 | # MAGIC - Utilizes Databricks utilities to programmatically fetch the username. This could be useful for traceability or logging purposes. 89 | # MAGIC 90 | # MAGIC 2. **Set Experiment and Model Names**: 91 | # MAGIC - Retrieves the name of the most recently updated experiment, assumed to have been set in earlier steps. 92 | # MAGIC - Defines a specific name for the model in the registry, which in this case is "Churn Prediction Bank". 93 | # MAGIC 94 | # MAGIC 3. **Fetch and Sort Experiment Runs**: 95 | # MAGIC - Retrieves the details of the experiment using its name. 96 | # MAGIC - Searches for all runs within the experiment and sorts them based on the F1 score on the validation set, in descending order. 97 | # MAGIC 98 | # MAGIC 4. **Identify the Best Model Run ID**: 99 | # MAGIC - The run ID corresponding to the highest validation F1 score is then stored in the `best_run_id` variable. 100 | # MAGIC 101 | # MAGIC > **Note**: The `best_run_id` variable now holds the run ID of the model that performed best in the specified experiment, according to the F1 score on the validation set. 
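# A hedged alternative sketch: MlflowClient.search_runs can sort server-side, which avoids pulling
# every run into pandas. It assumes the runs logged a `val_f1_score` metric (as in this example)
# and that `experiment_details` has been set as in the next cell.
#
#   best_runs = client.search_runs(
#       [experiment_details.experiment_id],
#       order_by=["metrics.val_f1_score DESC"],
#       max_results=1,
#   )
#   best_run_id = best_runs[0].info.run_id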
102 | # MAGIC
103 | # MAGIC
104 | 
105 | # COMMAND ----------
106 | 
107 | # Initialize the Databricks utilities to programmatically fetch the username
108 | username = dbutils.notebook.entry_point.getDbutils().notebook().getContext().userName().get()
109 | 
110 | # Retrieve the name of the latest experiment; assumed to have been set in earlier steps
111 | experiment_name = latest_experiment.name
112 | 
113 | # Define the model name for the registry, specific to our use-case of Churn Prediction for a Bank
114 | registry_model_name = "Churn Prediction Bank"
115 | 
116 | # Fetch the experiment details using its name
117 | experiment_details = client.get_experiment_by_name(experiment_name)
118 | 
119 | # Search for runs within the experiment and sort them by validation F1 score in descending order
120 | sorted_runs = mlflow.search_runs(experiment_details.experiment_id).sort_values("metrics.val_f1_score", ascending=False)
121 | 
122 | # Get the run ID of the best model based on the highest validation F1 score
123 | best_run_id = sorted_runs.iloc[0]["run_id"]  # positional indexing: sort_values does not reset the index, so .loc[0] would return the unsorted first row
124 | 
125 | best_run_id
126 | # Note: The variable `best_run_id` now contains the run ID of the best model in the specified experiment
127 | 
128 | # COMMAND ----------
129 | 
130 | # MAGIC %md
131 | # MAGIC ### Registering the Best Model in MLflow's Model Registry
132 | # MAGIC
133 | # MAGIC The aim of this code block is to register the best-performing model (based on the highest validation F1 score) in MLflow's model registry. Here's how it does it:
134 | # MAGIC
135 | # MAGIC 1. **Initialize Model URI**:
136 | # MAGIC    - Constructs the model URI using the `best_run_id` obtained from previous steps. The URI will uniquely identify the model's location.
137 | # MAGIC
138 | # MAGIC 2. **Attempt Model Registration**:
139 | # MAGIC    - Tries to register the model under the name specified by `registry_model_name`.
140 | # MAGIC
141 | # MAGIC 3. **Success and Failure Scenarios**:
142 | # MAGIC    - Prints a success message along with the model URI if the model registration is successful.
143 | # MAGIC    - Captures and prints an error message if it fails to register the model.
144 | # MAGIC
145 | # MAGIC > **Note**: The `model_details` variable will be populated with details about the registered model if the registration is successful. These details include the model name, version, and other metadata.
146 | # MAGIC
147 | 
148 | # COMMAND ----------
149 | 
150 | # Initialize the model's URI using the best run ID obtained from previous steps
151 | model_uri = f"runs:/{best_run_id}/model"
152 | 
153 | # Register the model in MLflow's model registry under the specified name
154 | try:
155 |     model_details = mlflow.register_model(model_uri=model_uri, name=registry_model_name)
156 |     print(f"Successfully registered model '{registry_model_name}' with URI '{model_uri}'.")
157 | except mlflow.exceptions.MlflowException as e:
158 |     print(f"Failed to register model '{registry_model_name}': {str(e)}")
159 | 
160 | model_details
161 | # Note: The variable `model_details` now contains details about the registered model
162 | 
163 | # COMMAND ----------
164 | 
165 | # MAGIC %md
166 | # MAGIC ### Updating Model Metadata in the MLflow Model Registry
167 | # MAGIC
168 | # MAGIC In this step, we accomplish two primary tasks:
169 | # MAGIC
170 | # MAGIC 1. **Update Registered Model Metadata**:
171 | # MAGIC    - We attempt to update the description of an already registered model in the MLflow Model Registry.
172 | # MAGIC - The description aims to clarify the purpose of the model, in this case, "This model predicts whether a bank customer will churn or not." 173 | # MAGIC 174 | # MAGIC 2. **Update Version-Specific Metadata**: 175 | # MAGIC - We update the metadata for a specific version of the model. 176 | # MAGIC - Here, we add a description specifying that this model version is based on scikit-learn. 177 | # MAGIC 178 | # MAGIC Both operations are wrapped in try-except blocks for robust error handling. Should any operation fail, an error message will be printed to provide insight into the failure. 179 | # MAGIC 180 | # MAGIC > **Note**: The `model_details` variable is assumed to contain essential information about the registered model and its specific version. 181 | # MAGIC 182 | 183 | # COMMAND ---------- 184 | 185 | # Update the metadata of an already registered model 186 | try: 187 | client.update_registered_model( 188 | name=model_details.name, 189 | description="This model predicts whether a bank customer will churn or not." 190 | ) 191 | print(f"Successfully updated the description for the registered model '{model_details.name}'.") 192 | except mlflow.exceptions.MlflowException as e: 193 | print(f"Failed to update the registered model '{model_details.name}': {str(e)}") 194 | 195 | # Update the metadata for a specific version of the model 196 | try: 197 | client.update_model_version( 198 | name=model_details.name, 199 | version=model_details.version, 200 | description="This is a scikit-learn based model." 201 | ) 202 | print(f"Successfully updated the description for version {model_details.version} of the model '{model_details.name}'.") 203 | except mlflow.exceptions.MlflowException as e: 204 | print(f"Failed to update version {model_details.version} of the model '{model_details.name}': {str(e)}") 205 | 206 | # Note: The `model_details` variable is assumed to contain details about the registered model and its version 207 | 208 | # COMMAND ---------- 209 | 210 | # MAGIC %md 211 | # MAGIC ### Transitioning Model Version to 'Staging' Stage in the MLflow Model Registry 212 | # MAGIC 213 | # MAGIC In this step, the following objectives are met: 214 | # MAGIC 215 | # MAGIC 1. **Transition Model Version**: 216 | # MAGIC - We aim to transition a specific version of the registered model to the 'Staging' stage in the MLflow Model Registry. 217 | # MAGIC 218 | # MAGIC 2. **Archiving Existing Versions**: 219 | # MAGIC - The `archive_existing_versions=True` flag ensures that any pre-existing versions of the model in the 'Staging' stage are archived. This helps in keeping only the most relevant version in the stage. 220 | # MAGIC 221 | # MAGIC 3. **Error Handling**: 222 | # MAGIC - The operation is wrapped in a try-except block. If the transition operation fails for any reason, a detailed error message will be displayed to help diagnose the issue. 223 | # MAGIC 224 | # MAGIC > **Note**: Successful completion will print a message confirming the successful transition of the model version to the 'Staging' stage. 
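# Hedged follow-up: once the transition cell below has run, the resulting stage can be confirmed
# directly from the registry (MlflowClient.get_model_version is a standard client call):
#
#   confirmed = client.get_model_version(name=model_details.name, version=model_details.version)
#   print(f"Version {confirmed.version} is now in stage: {confirmed.current_stage}")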
225 | # MAGIC 226 | # MAGIC 227 | 228 | # COMMAND ---------- 229 | 230 | # Transition the model version to the 'Staging' stage in the model registry 231 | try: 232 | client.transition_model_version_stage( 233 | name=model_details.name, 234 | version=model_details.version, 235 | stage="Staging", 236 | archive_existing_versions=True # Archives any existing versions in the 'Staging' stage 237 | ) 238 | print(f"Successfully transitioned version {model_details.version} of the model '{model_details.name}' to 'Staging'.") 239 | except mlflow.exceptions.MlflowException as e: 240 | print(f"Failed to transition version {model_details.version} of the model '{model_details.name}' to 'Staging': {str(e)}") 241 | 242 | 243 | # COMMAND ---------- 244 | 245 | # MAGIC %md 246 | # MAGIC ### Model Registry Webhooks 247 | # MAGIC 248 | # MAGIC ### Supported Events 249 | # MAGIC * **MODEL_VERSION_CREATED**: A new model version was created for the associated model. 250 | # MAGIC * **MODEL_VERSION_TRANSITIONED_STAGE**: A model version’s stage was changed. 251 | # MAGIC * **TRANSITION_REQUEST_CREATED**: A user requested a model version’s stage be transitioned. 252 | # MAGIC * **COMMENT_CREATED**: A user wrote a comment on a registered model. 253 | # MAGIC * **REGISTERED_MODEL_CREATED**: A new registered model was created. This event type can only be specified for a registry-wide webhook, which can be created by not specifying a model name in the create request. 254 | # MAGIC * **MODEL_VERSION_TAG_SET**: A user set a tag on the model version. 255 | # MAGIC * **MODEL_VERSION_TRANSITIONED_TO_STAGING**: A model version was transitioned to staging. 256 | # MAGIC * **MODEL_VERSION_TRANSITIONED_TO_PRODUCTION**: A model version was transitioned to production. 257 | # MAGIC * **MODEL_VERSION_TRANSITIONED_TO_ARCHIVED**: A model version was archived. 258 | # MAGIC * **TRANSITION_REQUEST_TO_STAGING_CREATED**: A user requested a model version be transitioned to staging. 259 | # MAGIC * **TRANSITION_REQUEST_TO_PRODUCTION_CREATED**: A user requested a model version be transitioned to production. 260 | # MAGIC * **TRANSITION_REQUEST_TO_ARCHIVED_CREATED**: A user requested a model version be archived. 261 | # MAGIC 262 | # MAGIC ### Types of webhooks 263 | # MAGIC * **HTTP webhook** — send triggers to endpoints of your choosing such as slack, AWS Lambda, Azure Functions, or GCP Cloud Functions 264 | # MAGIC * **Job webhook** — trigger a job within the Databricks workspace 265 | 266 | # COMMAND ---------- 267 | 268 | # MAGIC %md 269 | # MAGIC ## MLflow Endpoint Utility Functions 270 | # MAGIC 271 | # MAGIC This script contains utility functions to interact with MLflow REST API endpoints. The code imports necessary modules, initializes an MLflow client, and defines a series of functions to handle REST API calls. Below are the key components: 272 | # MAGIC 273 | # MAGIC ### Import Statements 274 | # MAGIC 275 | # MAGIC - `http_request from mlflow.utils.rest_utils`: Required for making HTTP requests to the MLflow server. 276 | # MAGIC - `json`: Standard library for handling JSON formatted data. 277 | # MAGIC 278 | # MAGIC ### Functions 279 | # MAGIC 280 | # MAGIC #### `get_mlflow_client()` 281 | # MAGIC - **Purpose**: Returns an initialized MLflowClient object for further operations. 282 | # MAGIC - **Return Type**: `MlflowClient` 283 | # MAGIC 284 | # MAGIC #### `get_host_creds(client)` 285 | # MAGIC - **Parameters**: `client` - Initialized MlflowClient object. 
286 | # MAGIC - **Purpose**: Fetches the host and token credentials from the MLflow tracking server. 287 | # MAGIC - **Return Type**: Host and token credentials. 288 | # MAGIC 289 | # MAGIC #### `mlflow_call_endpoint(endpoint, method, body='{}')` 290 | # MAGIC - **Parameters**: 291 | # MAGIC - `endpoint` (str): The MLflow API endpoint to call. 292 | # MAGIC - `method` (str): HTTP method to use ('GET' or other HTTP methods). 293 | # MAGIC - `body` (str, optional): JSON-formatted request payload, default is an empty JSON object. 294 | # MAGIC - **Purpose**: Makes a REST API call to the specified MLflow endpoint. 295 | # MAGIC - **Return Type**: Dictionary containing the JSON response from the API call or `None` if the request fails. 296 | # MAGIC - **Error Handling**: Captures exceptions and prints an error message detailing the failure. 297 | # MAGIC 298 | # MAGIC ### Client Initialization and Credential Retrieval 299 | # MAGIC 300 | # MAGIC After defining the functions, the script initializes an `MlflowClient` object and fetches the host and token credentials. 301 | # MAGIC 302 | # MAGIC - `client = get_mlflow_client()`: Initializes the client. 303 | # MAGIC - `host_creds = get_host_creds(client)`: Retrieves host and token credentials. 304 | # MAGIC - `host = host_creds.host`: Extracts the host. 305 | # MAGIC - `token = host_creds.token`: Extracts the token. 306 | # MAGIC 307 | # MAGIC 308 | 309 | # COMMAND ---------- 310 | 311 | from mlflow.utils.rest_utils import http_request 312 | import json 313 | 314 | def get_mlflow_client(): 315 | """Returns an initialized MLflowClient object.""" 316 | return mlflow.tracking.client.MlflowClient() 317 | 318 | def get_host_creds(client): 319 | """Fetches host and token credentials.""" 320 | return client._tracking_client.store.get_host_creds() 321 | 322 | def mlflow_call_endpoint(endpoint, method, body='{}'): 323 | """Calls an MLflow REST API endpoint. 324 | 325 | Parameters: 326 | endpoint (str): The endpoint to call. 327 | method (str): HTTP method ('GET' or other HTTP methods). 328 | body (str): JSON-formatted request payload. 329 | 330 | Returns: 331 | dict: JSON response as a dictionary. 332 | """ 333 | host_creds = get_host_creds(get_mlflow_client()) 334 | 335 | try: 336 | if method == 'GET': 337 | response = http_request( 338 | host_creds=host_creds, 339 | endpoint=f"/api/2.0/mlflow/{endpoint}", 340 | method=method, 341 | params=json.loads(body) 342 | ) 343 | else: 344 | response = http_request( 345 | host_creds=host_creds, 346 | endpoint=f"/api/2.0/mlflow/{endpoint}", 347 | method=method, 348 | json=json.loads(body) 349 | ) 350 | 351 | return response.json() 352 | 353 | except Exception as e: 354 | print(f"Failed to call MLflow endpoint '{endpoint}': {str(e)}") 355 | return None 356 | 357 | 358 | client = get_mlflow_client() 359 | host_creds = get_host_creds(client) 360 | host = host_creds.host 361 | token = host_creds.token 362 | 363 | # COMMAND ---------- 364 | 365 | # MAGIC %md 366 | # MAGIC ### Setting Up Slack Notifications and Webhooks 367 | # MAGIC 368 | # MAGIC You can read more about Slack webhooks [here](https://api.slack.com/messaging/webhooks#create_a_webhook). 369 | # MAGIC 370 | # MAGIC First, we set up a webhook to notify us whenever a **New model version is created**. 371 | # MAGIC 372 | # MAGIC In the next cell assign the slack_webhook variable the link to your webhook. 
It should look as follows`"https://hooks.slack.com/services/?????????/??????????/????????????????????????"` 373 | 374 | # COMMAND ---------- 375 | 376 | slack_webhook = "https://hooks.slack.com/services/?????????/??????????/???????????????????????" 377 | 378 | # COMMAND ---------- 379 | 380 | import json 381 | 382 | trigger_for_slack = json.dumps({ 383 | "model_name": registry_model_name, 384 | "events": ["MODEL_VERSION_CREATED"], 385 | "description": "Triggered when a new model version is created.", 386 | "http_url_spec": { 387 | "url": slack_webhook 388 | } 389 | }) 390 | 391 | mlflow_call_endpoint("registry-webhooks/create", method = "POST", body = trigger_for_slack) 392 | 393 | # COMMAND ---------- 394 | 395 | # MAGIC %md 396 | # MAGIC Similarly we can create a webhook that notifies us when a **New transition request is made for a mode version**. 397 | 398 | # COMMAND ---------- 399 | 400 | trigger_for_slack = json.dumps({ 401 | "model_name": registry_model_name, 402 | "events": ["TRANSITION_REQUEST_CREATED"], 403 | "description": "Triggered when a new transition request for a model has been made.", 404 | "http_url_spec": { 405 | "url": slack_webhook 406 | } 407 | }) 408 | 409 | mlflow_call_endpoint("registry-webhooks/create", method = "POST", body = trigger_for_slack) 410 | 411 | # COMMAND ---------- 412 | 413 | # MAGIC %md 414 | # MAGIC ### Listing all webhooks. 415 | 416 | # COMMAND ---------- 417 | 418 | list_model_webhooks = json.dumps({"model_name": registry_model_name}) 419 | 420 | model_webhooks = mlflow_call_endpoint("registry-webhooks/list", method = "GET", body = list_model_webhooks) 421 | model_webhooks 422 | 423 | # COMMAND ---------- 424 | 425 | # MAGIC %md 426 | # MAGIC You can also **delete webhooks**. 427 | # MAGIC 428 | # MAGIC You can use the below cell to delete webhooks by ID or delete all the webhooks for a specific model. 429 | 430 | # COMMAND ---------- 431 | 432 | # for webhook in model_webhooks["webhooks"]: 433 | # mlflow_call_endpoint( 434 | # "registry-webhooks/delete", 435 | # method="DELETE", 436 | # body=json.dumps({'id': webhook["id"]}) 437 | # ) 438 | 439 | # COMMAND ---------- 440 | 441 | 442 | -------------------------------------------------------------------------------- /Chapter-09/model-drift.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | # MAGIC %md 3 | # MAGIC %md 4 | # MAGIC ## Author 5 | # MAGIC 6 | # MAGIC - **Debu Sinha** 7 | # MAGIC 8 | # MAGIC ## Tested Environment 9 | # MAGIC 10 | # MAGIC - **Databricks Runtime**: This notebook is tested on Databricks Runtime for Machine Learning 13.3 LTS or above. 11 | # MAGIC - **Cluster Configuration**: Single node cluster with at least 32GB RAM and 4 VCPU. 12 | # MAGIC - **Note**: The same cluster set up in Chapters 3 and 4 will be used here. 13 | # MAGIC 14 | # MAGIC ## Cluster Setup Instructions 15 | # MAGIC 16 | # MAGIC 1. **Create a Cluster**: 17 | # MAGIC - Navigate to the `Compute` icon on the left sidebar and click on `Create Cluster`. 18 | # MAGIC - Under `Policy`, select `Unrestricted`. 19 | # MAGIC - Enter a name for your cluster, for example, `demo`, into the cluster name text box. 20 | # MAGIC - In `Cluster Mode`, select `Single Node`. 21 | # MAGIC - Choose `Databricks Runtime Version` 13.3 LTS (Scala 2.12, Spark 3.4.1) from the `ML` tab. 22 | # MAGIC - On `AWS`, select `i3.xlarge` / on `Azure`, select `Standard_DS4_V2` as __Node type__. 
23 | # MAGIC - Click on `Create Cluster` and wait for your cluster to be provisioned.
24 | # MAGIC
25 | # MAGIC 2. **Attach this Notebook to Your Cluster**:
26 | # MAGIC - Click on the menu labeled `Detached` at the top left of this workbook.
27 | # MAGIC - Select your cluster name to attach this notebook to your cluster.
28 | # MAGIC --------
29 | # MAGIC ### Outline
30 | # MAGIC
31 | # MAGIC We simulate a batch inference scenario in which we train, deploy, and maintain a model that predicts sales for an e-commerce website on a monthly basis.
32 | # MAGIC
33 | # MAGIC **Data interval**: Arrives monthly
34 | # MAGIC **Date range**: 01/01/2023 - 03/31/2023 35 | # MAGIC 36 | # MAGIC **Workflow**: 37 | # MAGIC * Load the new month of incoming data 38 | # MAGIC * Apply incoming data checks 39 | # MAGIC * Error and drift evaluation 40 | # MAGIC * Identify and address any errors and drifts 41 | # MAGIC * Train a new model 42 | # MAGIC * Apply model validation checks versus the existing model in production 43 | # MAGIC * If checks pass, deploy the new candidate model to production 44 | # MAGIC * If checks fail, do not deploy the new candidate model
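# The workflow above maps onto the utility notebooks run in the setup section; a condensed,
# commented sketch of one month is shown here (variable names mirror those defined later in this
# notebook, and the metric key passed to compare_model_perfs is a placeholder):
#
#   new_pdf = spark.read.csv(raw_good_data_path, header=True, inferSchema=True).toPandas()
#   prod_run = get_run_from_registered_model(mlflow_experiment_name, stage="Production")
#   prod_pdf = load_delta_table_from_run(prod_run).toPandas()
#
#   # incoming data checks
#   check_null_proportion(new_pdf, null_proportion_threshold=0.5)
#   check_diff_in_summary_stats(calculate_summary_stats(new_pdf),
#                               load_summary_stats_pdf_from_run(prod_run, project_local_tmp_dir),
#                               num_cols + [target_col], stats_threshold_limit,
#                               ["mean", "median", "std", "min", "max"])
#   check_dist_ks_bonferroni_test(prod_pdf, new_pdf, num_cols + [target_col], p_threshold)
#   check_categorical_diffs(prod_pdf, new_pdf, cat_cols, p_threshold)
#
#   # retrain on the updated Gold table, then compare against Production before promoting
#   candidate_run = train_sklearn_rf_model(run_name, months_gold_path, model_params, misc_params)
#   compare_model_perfs(candidate_run, prod_run, min_model_r2_threshold, metric_to_check="<metric key>")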
45 | # MAGIC 46 | # MAGIC **Reproducibility Tools**: 47 | # MAGIC * [MLflow](https://www.mlflow.org/docs/latest/index.html) for model parameters, metrics, and artifacts 48 | # MAGIC * [Delta](https://docs.delta.io/latest/index.html) for data versioning
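# As a concrete example of the MLflow + Delta combination above: train_sklearn_rf_model logs the
# Delta path and version as run parameters, so the exact snapshot a Production model was trained
# on can be reloaded later. A short sketch using the helpers defined in ./util/training:
#
#   prod_run = get_run_from_registered_model(mlflow_experiment_name, stage="Production")
#   training_snapshot_df = load_delta_table_from_run(prod_run)  # reads with option("versionAsOf", ...)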
49 | # MAGIC 50 | # MAGIC Although this notebook specifically addresses tests to monitor a supervised ML model for batch inference, the same tests are applicable in streaming and real-time settings. 51 | 52 | # COMMAND ---------- 53 | 54 | # MAGIC %md 55 | # MAGIC 56 | # MAGIC ### Run setup and utils notebooks 57 | 58 | # COMMAND ---------- 59 | 60 | # MAGIC %run ./config/setup 61 | 62 | # COMMAND ---------- 63 | 64 | # MAGIC %run ./util/training 65 | 66 | # COMMAND ---------- 67 | 68 | # MAGIC %run ./data/datagen 69 | 70 | # COMMAND ---------- 71 | 72 | # MAGIC %run ./util/monitoring 73 | 74 | # COMMAND ---------- 75 | 76 | # Remove all existing widgets 77 | dbutils.widgets.removeAll() 78 | 79 | # Create three widgets for the stats threshold limit, p-threshold, and min model R2 threshold 80 | dbutils.widgets.text("stats_threshold_limit", "0.5") 81 | dbutils.widgets.text("p_threshold", "0.05") 82 | dbutils.widgets.text("min_model_r2_threshold", "0.005") 83 | 84 | # Get the values of the widgets 85 | # stats_threshold_limit: how much we should allow basic summary stats to shift 86 | stats_threshold_limit = float(dbutils.widgets.get("stats_threshold_limit")) 87 | 88 | # p_threshold: the p-value below which to reject null hypothesis 89 | p_threshold = float(dbutils.widgets.get("p_threshold")) 90 | 91 | # min_model_r2_threshold: minimum model improvement 92 | min_model_r2_threshold = float(dbutils.widgets.get("min_model_r2_threshold")) 93 | 94 | # COMMAND ---------- 95 | 96 | # MAGIC %md 97 | # MAGIC # Model Drift Dummy Dataset 98 | 99 | # COMMAND ---------- 100 | 101 | # MAGIC %md 102 | # MAGIC 103 | # MAGIC 104 | # MAGIC ### Month 1 - Base line Data 105 | # MAGIC 106 | # MAGIC We have generated a dummy dataset to showcase model drift. The dataset consists of time series data for three months. The independent features of the dataset are: 107 | # MAGIC 108 | # MAGIC | Feature | Type | Description | 109 | # MAGIC |---|---|---| 110 | # MAGIC | Date | date | The date for which the record belongs. | 111 | # MAGIC | Temperature | numeric | The highest daily temperature in Fahrenheit. | 112 | # MAGIC | Weather_Condition | categorical | The weather condition, which can be sunny, cloudy, or rainy. | 113 | # MAGIC | Promotion_Type | categorical | The type of promotion, which can be a discount, free gift, or bundle deal. | 114 | # MAGIC | Website_Traffic | numeric | The total website traffic. | 115 | # MAGIC | Device_Type | categorical | The type of device used to access the website, which can be mobile, desktop, or tablet. | 116 | # MAGIC 117 | # MAGIC The target variable of the dataset is Daily_Sales (numeric). Daily_Sales has the following correlations with the independent features for the first month: 118 | # MAGIC 119 | # MAGIC * Positive correlation with Temperature and Website_Traffic. 120 | # MAGIC * Negative correlation with Weather_Condition and Device_Type. 121 | # MAGIC 122 | # MAGIC ### Data and Model Management 123 | # MAGIC 124 | # MAGIC #### Variables 125 | # MAGIC 126 | # MAGIC The following variables are also defined during our setup to help with execution down the line: 127 | # MAGIC 128 | # MAGIC Variable | Description 129 | # MAGIC ---|--- 130 | # MAGIC `project_home_dir` | The path to the project home directory. 131 | # MAGIC `raw_good_data_path` | The path to the directory where the raw data is stored as csv. 132 | # MAGIC `raw_month2_bad_data_path` | The path to the directory where the bad data for simulating feature drift is stored as csv. 
133 | # MAGIC `months_gold_path` | The path to the directory where the clean and processed data is stored in Delta format. 134 | # MAGIC `mlflow_experiment_name` | The name of the MLflow experiment where the model will be registered. 135 | # MAGIC `mlflow_experiment_path` | The path relative to our home directory in the workspace where the experiment will be located. 136 | # MAGIC 137 | # MAGIC 138 | # MAGIC 139 | # MAGIC 140 | 141 | # COMMAND ---------- 142 | 143 | print(f'good raw data file location : {raw_good_data_path}') 144 | print(f'bad raw data location : {raw_month2_bad_data_path}') 145 | print(f'Gold Delta table path : {months_gold_path}') 146 | print(f'MLflow experiment name : {mlflow_experiment_name}') 147 | print(f'MLflow experiment path : {mlflow_experiment_path}') 148 | 149 | # COMMAND ---------- 150 | 151 | # MAGIC %md 152 | # MAGIC #### i. Initial Data load 153 | # MAGIC 154 | # MAGIC Load the first month of data which we use to train and evaluate our first model. 155 | # MAGIC 156 | # MAGIC We create a "Gold" table to which we will be appending each subsequent month of data. 157 | # MAGIC 158 | 159 | # COMMAND ---------- 160 | 161 | # Ensure we start with no existing Delta table 162 | dbutils.fs.rm(months_gold_path, True) 163 | 164 | # Incoming Month 1 Data 165 | raw_data = spark.read.csv(raw_good_data_path, header=True, inferSchema=True) 166 | 167 | # Filter the DataFrame to only include data for January 2023 168 | raw_data_month1 = raw_data.filter(raw_data["Date"].between("2023-01-01", "2023-01-31")) 169 | 170 | # Print the filtered DataFrame 171 | raw_data_month1.show() 172 | 173 | # COMMAND ---------- 174 | 175 | import pyspark.sql.functions as F 176 | # Create inital version of the Gold Delta table we will use for training - this will be updated with subsequent "months" of data 177 | raw_data_month1.withColumn("month", F.lit("month_1")).write.format("delta").mode("overwrite").partitionBy("month").save(months_gold_path) 178 | 179 | # COMMAND ---------- 180 | 181 | #list files in the gold delta table path 182 | display(dbutils.fs.ls(months_gold_path)) 183 | 184 | # COMMAND ---------- 185 | 186 | # MAGIC %md 187 | # MAGIC #### ii. 
Model Training
188 | 
189 | # COMMAND ----------
190 | 
191 | #read gold data for month 1 from the Delta table
192 | month1_gold_delta_table = DeltaTable.forPath(spark, path=months_gold_path)
193 | month1_gold_df = month1_gold_delta_table.toDF()
194 | 
195 | # Set the month number - used for naming the MLflow run and tracked as a parameter
196 | month = 1
197 | 
198 | # Specify name of MLflow run
199 | run_name = f"month_{month}"
200 | 
201 | target_col = "Daily_Sales"
202 | cat_cols = [col[0] for col in month1_gold_df.dtypes if col[1]=="string" and col[0]!='month']
203 | num_cols= [col[0] for col in month1_gold_df.dtypes if ((col[1]=="int" or col[1]=="double") and col[0]!="Daily_Sales") ]
204 | 
205 | print(f"category columns : {cat_cols}")
206 | print(f"numeric columns : {num_cols}")
207 | print(f"target column : {target_col}")
208 | 
209 | # Define the parameters to pass in the RandomForestRegressor model
210 | model_params = {"n_estimators": 500,
211 |                 "max_depth": 5,
212 |                 "max_features": "log2"}
213 | 
214 | # Define a dictionary of parameters that we would like to use during preprocessing
215 | misc_params = {"month": month,
216 |                "target_col": target_col,
217 |                "cat_cols": cat_cols,
218 |                "num_cols": num_cols}
219 | 
220 | # COMMAND ----------
221 | 
222 | # Trigger model training and logging to MLflow
223 | month1_run = train_sklearn_rf_model(run_name,
224 |                                     months_gold_path,
225 |                                     model_params,
226 |                                     misc_params)
227 | 
228 | 
229 | month_1_run_id = month1_run.info.run_id
230 | 
231 | # COMMAND ----------
232 | 
233 | # MAGIC %md
234 | # MAGIC
235 | # MAGIC #### iii. Model Deployment
236 | # MAGIC We first register the model to the MLflow Model Registry. For demonstration purposes, we will immediately transition the model to the "Production" stage in the MLflow Model Registry. However, in a real-world scenario, one should have a robust model validation process in place prior to migrating a model to Production.
237 | # MAGIC
238 | # MAGIC We will demonstrate a multi-stage approach in the subsequent sections:
239 | # MAGIC 1. Transitioning the model to the "Staging" stage.
240 | # MAGIC 2. Conducting model validation checks.
241 | # MAGIC 3. Only then, triggering a transition from Staging to Production once these checks are satisfied.
242 | # MAGIC
243 | # MAGIC
244 | 
245 | # COMMAND ----------
246 | 
247 | # Register model to MLflow Model Registry
248 | month_1_model_version = mlflow.register_model(model_uri=f"runs:/{month_1_run_id}/model", name=mlflow_experiment_name)
249 | 
250 | # COMMAND ----------
251 | 
252 | # Transition model to Production
253 | month_1_model_version = transition_model(month_1_model_version, stage="Production")
254 | print(month_1_model_version)
255 | 
256 | # COMMAND ----------
257 | 
258 | # MAGIC %md
259 | # MAGIC
260 | # MAGIC ### Month 2 - Arrival of New Data
261 | # MAGIC
262 | # MAGIC After deploying our model for a month, we are now faced with the arrival of a fresh month's worth of data. Let's explore two scenarios related to this new data:
263 | # MAGIC
264 | # MAGIC **Scenario 1: Missing values in website_traffic**
265 | # MAGIC A bug in an updated upstream data cleaning process causes the `website_traffic` counts for the promotion types `bundle_deal` and `free_gift` to be empty.
266 | # MAGIC
267 | # MAGIC **Scenario 2: Introduction of new measurement for temperature**
268 | # MAGIC Also, during the upstream data generation procedure, the temperature values are now being captured in __Celsius__ rather than in __Fahrenheit__.
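# Hedged illustration only -- the actual bad data is produced by ./data/datagen; this just shows
# how the two scenarios above could be simulated on a good month of data (column names follow the
# feature table earlier in this notebook; category value spellings follow the scenario text).
import pyspark.sql.functions as F

illustrative_bad_month = (
    raw_data
    .withColumn("Temperature", (F.col("Temperature") - 32) * 5.0 / 9.0)  # Fahrenheit -> Celsius
    .withColumn(
        "Website_Traffic",
        F.when(F.col("Promotion_Type").isin("bundle_deal", "free_gift"), F.lit(None))
         .otherwise(F.col("Website_Traffic")),
    )  # traffic counts blanked for the affected promotion types
)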

# MAGIC %md
# MAGIC
# MAGIC #### iii. Model Deployment
# MAGIC We first register the model in the MLflow Model Registry. For demonstration purposes, we will immediately transition the model to the "Production" stage. However, in a real-world scenario, one should have a robust model validation process in place prior to migrating a model to Production.
# MAGIC
# MAGIC We will demonstrate a multi-stage approach in the subsequent sections:
# MAGIC 1. Transitioning the model to the "Staging" stage.
# MAGIC 2. Conducting model validation checks.
# MAGIC 3. Only then, triggering a transition from Staging to Production once these checks are satisfied.

# COMMAND ----------

# Register model to MLflow Model Registry
month_1_model_version = mlflow.register_model(model_uri=f"runs:/{month_1_run_id}/model", name=mlflow_experiment_name)

# COMMAND ----------

# Transition model to Production
month_1_model_version = transition_model(month_1_model_version, stage="Production")
print(month_1_model_version)

# COMMAND ----------
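
# MAGIC %md
# MAGIC `transition_model` is another helper defined outside this notebook. Below is a minimal sketch of such a wrapper, assuming the stage-based Model Registry API (`MlflowClient.transition_model_version_stage`); the real helper may differ, for example in whether it archives existing versions or updates the version description.

# COMMAND ----------

# Illustrative sketch only: not the actual transition_model implementation (and not invoked here)
from mlflow.tracking import MlflowClient


def transition_model_sketch(model_version, stage):
    client = MlflowClient()
    client.transition_model_version_stage(
        name=model_version.name,
        version=model_version.version,
        stage=stage,
        # Assumption: archive any older versions already in the target stage
        archive_existing_versions=(stage == "Production"),
    )
    # Return the refreshed ModelVersion so callers can inspect the new stage
    return client.get_model_version(name=model_version.name, version=model_version.version)

# COMMAND ----------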

# MAGIC %md
# MAGIC
# MAGIC ### Month 2 - Arrival of New Data
# MAGIC
# MAGIC After deploying our model for a month, we are now faced with the arrival of a fresh month's worth of data. Let's explore two scenarios related to this new data:
# MAGIC
# MAGIC **Scenario 1: Missing values in website_traffic**
# MAGIC An updated upstream data cleaning process has a bug that causes the `website_traffic` counts for the promotion types `bundle_deal` and `free_gift` to be empty.
# MAGIC
# MAGIC **Scenario 2: Introduction of a new unit of measurement for temperature**
# MAGIC During the upstream data generation procedure, temperature values are now being captured in __Celsius__ rather than in __Fahrenheit__.
# MAGIC
# MAGIC **What are we simulating here?**
# MAGIC In this scenario, we are simulating two important factors:
# MAGIC - Feature drift: The characteristics of the data have changed over time, specifically with missing `website_traffic` entries for `bundle_deal` and `free_gift`.
# MAGIC - Upstream data errors: Unexpected changes or additions in the data generation process, such as the introduction of a different unit of measurement for temperature.

# COMMAND ----------

# MAGIC %md
# MAGIC #### i. Feature checks prior to model training
# MAGIC
# MAGIC **All features**
# MAGIC * Null checks
# MAGIC
# MAGIC **Numeric features**
# MAGIC * Summary statistic checks: mean, median, standard deviation, minimum, maximum
# MAGIC * Distribution checks
# MAGIC
# MAGIC **Categorical features**
# MAGIC * Check expected count for each level
# MAGIC * Check the mode

# COMMAND ----------

# Incoming Month 2 Data
raw_data_month2 = spark.read.csv(raw_month2_bad_data_path, header=True, inferSchema=True)

# Filter the DataFrame to only include data for Feb 2023
raw_data_month2 = raw_data_month2.filter(raw_data_month2["Date"].between("2023-02-01", "2023-02-28"))

# Print the filtered DataFrame
raw_data_month2.show(5)

# COMMAND ----------

# Compute summary statistics on the new incoming data.
# We keep only the columns that we monitored for the last model training data.
# Converting to a pandas DataFrame should be done with care: if the data is larger than what fits on the
# driver node, this can cause failures. For large datasets, use a proper sampling technique to estimate
# population summary statistics (a sketch of this approach follows below).
month_2_pdf = raw_data_month2.toPandas().drop(['Date'], axis=1)
summary_stats_month_2_pdf = calculate_summary_stats(month_2_pdf)
summary_stats_month_2_pdf

# COMMAND ----------
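
# MAGIC %md
# MAGIC The comments above caution against collecting large datasets to the driver. The cell below is an illustrative sketch of sampling in Spark before converting to pandas, together with a rough idea of what a `calculate_summary_stats`-style helper could compute; the sampling fraction is an arbitrary example and the real helper may track different statistics.

# COMMAND ----------

# Illustrative sketch only
# (1) For large tables, sample in Spark *before* collecting to the driver
sampled_month_2_pdf = (
    raw_data_month2
    .sample(withReplacement=False, fraction=0.1, seed=42)  # fraction chosen for illustration; tune to your data volume
    .toPandas()
    .drop(["Date"], axis=1)
)

# (2) A calculate_summary_stats-style helper might simply describe the numeric columns
def calculate_summary_stats_sketch(pdf):
    # pandas describe() returns count, mean, std, min, quartiles and max;
    # add the median explicitly since it is one of the statistics checked later
    stats_pdf = pdf.describe().transpose()
    stats_pdf["median"] = pdf.median(numeric_only=True)
    return stats_pdf

calculate_summary_stats_sketch(sampled_month_2_pdf)

# COMMAND ----------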

# Get the original MLflow run associated with the model registered under Production
current_prod_run = get_run_from_registered_model(mlflow_experiment_name, stage="Production")

# Load in the original version of the Delta table used at training time for the current Production model
current_prod_pdf = load_delta_table_from_run(current_prod_run).toPandas()

# Load the summary statistics pandas DataFrame for the data on which the model currently in Production was trained and evaluated
current_prod_stats_pdf = load_summary_stats_pdf_from_run(current_prod_run, project_local_tmp_dir)
current_prod_stats_pdf

# COMMAND ----------

# MAGIC %md
# MAGIC
# MAGIC **All features**
# MAGIC * Null checks

# COMMAND ----------

print("\nCHECKING PROPORTION OF NULLS.....")
check_null_proportion(month_2_pdf, null_proportion_threshold=.5)

# COMMAND ----------

# MAGIC %md
# MAGIC **Numeric features**
# MAGIC * Summary statistic checks: mean, median, standard deviation, minimum, maximum
# MAGIC * Distribution checks

# COMMAND ----------

statistic_list = ["mean", "median", "std", "min", "max"]

unique_feature_diff_array_month_2 = check_diff_in_summary_stats(summary_stats_month_2_pdf,
                                                                current_prod_stats_pdf,
                                                                num_cols + [target_col],
                                                                stats_threshold_limit,
                                                                statistic_list)

unique_feature_diff_array_month_2

# COMMAND ----------

print(f"Let's look at the box plots of the features that exceed the stats_threshold_limit of {stats_threshold_limit}")
plot_boxplots(unique_feature_diff_array_month_2, current_prod_pdf, month_2_pdf)

# COMMAND ----------

print("\nCHECKING VARIANCES WITH LEVENE TEST.....")
check_diff_in_variances(current_prod_pdf, month_2_pdf, num_cols, p_threshold)

print("\nCHECKING KS TEST.....")
check_dist_ks_bonferroni_test(current_prod_pdf, month_2_pdf, num_cols + [target_col], p_threshold)

# COMMAND ----------
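
# MAGIC %md
# MAGIC Checks like these are typically built on standard `scipy.stats` tests. Below is a minimal sketch, assuming Levene's test for equality of variances and a two-sample Kolmogorov-Smirnov test with a Bonferroni-corrected significance level; the real `check_diff_in_variances` and `check_dist_ks_bonferroni_test` helpers may handle nulls and report results differently.

# COMMAND ----------

# Illustrative sketch only: not the actual check_* implementations
from scipy import stats


def check_diff_in_variances_sketch(reference_pdf, new_pdf, num_cols, p_threshold):
    for col in num_cols:
        _, p_value = stats.levene(reference_pdf[col].dropna(), new_pdf[col].dropna(), center="median")
        if p_value <= p_threshold:
            print(f"{col}: variances differ (Levene p-value = {p_value:.4f})")


def check_dist_ks_bonferroni_sketch(reference_pdf, new_pdf, cols, p_threshold):
    # Bonferroni correction: divide the significance level by the number of comparisons
    corrected_threshold = p_threshold / len(cols)
    for col in cols:
        _, p_value = stats.ks_2samp(reference_pdf[col].dropna(), new_pdf[col].dropna())
        if p_value <= corrected_threshold:
            print(f"{col}: distributions differ (KS p-value = {p_value:.4f}, corrected threshold = {corrected_threshold:.4f})")


check_diff_in_variances_sketch(current_prod_pdf, month_2_pdf, num_cols, p_threshold)
check_dist_ks_bonferroni_sketch(current_prod_pdf, month_2_pdf, num_cols + [target_col], p_threshold)

# COMMAND ----------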

# MAGIC %md
# MAGIC
# MAGIC **Categorical features**
# MAGIC * Check expected count for each level
# MAGIC * Check the mode

# COMMAND ----------

check_categorical_diffs(current_prod_pdf, month_2_pdf, cat_cols, p_threshold)

# COMMAND ----------

# MAGIC %md
# MAGIC
# MAGIC **`Action`: Resolve data issues**
# MAGIC
# MAGIC After identifying the data issues with `Temperature` and `Website_Traffic` and collaborating with the upstream data processing team, we have successfully resolved these issues. The fixed data for the new month is incorporated into our Gold Delta table, and we proceed with training on the updated dataset to leverage the newly available information.

# COMMAND ----------

# Incoming corrected data
raw_data = spark.read.csv(raw_good_data_path, header=True, inferSchema=True)

# Filter the DataFrame to only include data for February 2023
raw_data_month2 = raw_data.filter(raw_data["Date"].between("2023-02-01", "2023-02-28"))

# Append the new month of data to the Gold Delta table to use for training
raw_data_month2.withColumn("month", F.lit("month_2")).write.format("delta").partitionBy("month").mode("append").save(months_gold_path)

# COMMAND ----------

# MAGIC %md
# MAGIC #### ii. Model Training
# MAGIC
# MAGIC Retrain the same model, but this time we are able to use an extra month of data.

# COMMAND ----------

# Set the month number - used for naming the MLflow run and tracked as a parameter
month = 2

# Specify name of MLflow run
run_name = f"month_{month}"

# Define the parameters to pass to the RandomForestRegressor model
model_params = {"n_estimators": 500,
                "max_depth": 5,
                "max_features": "log2"}

# Define a dictionary of parameters that we would like to use during preprocessing
misc_params = {"month": month,
               "target_col": target_col,
               "cat_cols": cat_cols,
               "num_cols": num_cols}

# COMMAND ----------

# Trigger model training and logging to MLflow
month2_run = train_sklearn_rf_model(run_name,
                                    months_gold_path,
                                    model_params,
                                    misc_params)

month_2_run_id = month2_run.info.run_id

# COMMAND ----------

# Register model to MLflow Model Registry
month_2_model_version = mlflow.register_model(model_uri=f"runs:/{month_2_run_id}/model", name=mlflow_experiment_name)

# Transition model to Staging
month_2_model_version = transition_model(month_2_model_version, stage="Staging")
print(month_2_model_version)

# COMMAND ----------

# MAGIC %md
# MAGIC
# MAGIC #### iii. Model checks prior to model deployment

# COMMAND ----------

# Get the MLflow run associated with the model currently registered under Staging
current_staging_run = get_run_from_registered_model(mlflow_experiment_name, stage="Staging")

metric_to_check = "r2_score_X_test"
compare_model_perfs(current_staging_run, current_prod_run, min_model_r2_threshold, metric_to_check)

# COMMAND ----------
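
# MAGIC %md
# MAGIC `compare_model_perfs` compares the candidate (Staging) run against the current Production run on the chosen metric. The cell below is a minimal, illustrative sketch of such a comparison, assuming both runs logged the metric under the same name and that the candidate must also clear the absolute `min_model_r2_threshold` floor; the real helper may format or decide things differently.

# COMMAND ----------

# Illustrative sketch only: not the actual compare_model_perfs implementation
def compare_model_perfs_sketch(staging_run, prod_run, min_r2_threshold, metric):
    staging_metric = staging_run.data.metrics[metric]
    prod_metric = prod_run.data.metrics[metric]

    print(f"Staging    {metric}: {staging_metric:.4f}")
    print(f"Production {metric}: {prod_metric:.4f}")

    if staging_metric < min_r2_threshold:
        print(f"Candidate model is below the minimum threshold of {min_r2_threshold}; do not promote.")
    elif staging_metric < prod_metric:
        print("Candidate model underperforms the current Production model; investigate before promoting.")
    else:
        print("Candidate model matches or beats the current Production model.")


compare_model_perfs_sketch(current_staging_run, current_prod_run, min_model_r2_threshold, metric_to_check)

# COMMAND ----------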

month_2_model_version = transition_model(month_2_model_version, stage="Production")
print(month_2_model_version)

# COMMAND ----------

# MAGIC %md
# MAGIC
# MAGIC ### Month 3 - New Data Arrives
# MAGIC
# MAGIC We have had a model in production for 2 months and have now obtained an additional month of data.
# MAGIC
# MAGIC **Scenario 3: Viral marketing campaign**
# MAGIC * A product campaign went viral on social media, and daily sales increased by 30%.
# MAGIC
# MAGIC **What are we simulating here?**
# MAGIC * Label drift
# MAGIC * Concept drift
# MAGIC * The underlying relationship between the features and the label has changed due to a viral marketing campaign.

# COMMAND ----------

# MAGIC %md
# MAGIC #### i. Feature checks prior to model training
# MAGIC
# MAGIC **All features**
# MAGIC * Null checks
# MAGIC
# MAGIC **Numeric features**
# MAGIC * Summary statistic checks: mean, median, standard deviation, minimum, maximum
# MAGIC * Distribution checks
# MAGIC
# MAGIC **Categorical features**
# MAGIC * Check expected count for each level
# MAGIC * Check the mode

# COMMAND ----------

# Incoming Month 3 Data
raw_data = spark.read.csv(raw_good_data_path, header=True, inferSchema=True)

# Filter the DataFrame to only include data for March 2023
raw_data_month3 = raw_data.filter(raw_data["Date"].between("2023-03-01", "2023-03-31"))

# Print the filtered DataFrame
raw_data_month3.show(5)

# COMMAND ----------

# Compute summary statistics on the new incoming data.
# We keep only the columns that we monitored for the last model training data.
# As before, converting to a pandas DataFrame should be done with care: if the data is larger than what
# fits on the driver node, this can cause failures. For large datasets, use a proper sampling technique
# to estimate population summary statistics.
month_3_pdf = raw_data_month3.toPandas().drop(['Date'], axis=1)
summary_stats_month_3_pdf = calculate_summary_stats(month_3_pdf)
summary_stats_month_3_pdf

# COMMAND ----------

# Get the current MLflow run associated with the model registered under Production
current_prod_run_2 = get_run_from_registered_model(mlflow_experiment_name, stage="Production")

# Load in the original version of the Delta table used at training time for the current Production model
current_prod_pdf_2 = load_delta_table_from_run(current_prod_run_2).toPandas()

# Load the summary statistics pandas DataFrame for the data on which the model currently in Production was trained and evaluated
current_prod_stats_pdf_2 = load_summary_stats_pdf_from_run(current_prod_run_2, project_local_tmp_dir)

# COMMAND ----------

# MAGIC %md
# MAGIC
# MAGIC **All features**
# MAGIC * Null checks

# COMMAND ----------

print("\nCHECKING PROPORTION OF NULLS.....")
check_null_proportion(month_3_pdf, null_proportion_threshold=.5)

# COMMAND ----------

# MAGIC %md
# MAGIC **Numeric features**
# MAGIC * Summary statistic checks: mean, median, standard deviation, minimum, maximum
# MAGIC * Distribution checks

# COMMAND ----------

unique_feature_diff_array_month_3 = check_diff_in_summary_stats(summary_stats_month_3_pdf,
                                                                current_prod_stats_pdf_2,
                                                                num_cols + [target_col],
                                                                stats_threshold_limit,
                                                                statistic_list)

unique_feature_diff_array_month_3

# COMMAND ----------

print(f"Let's look at the box plots of the features that exceed the stats_threshold_limit of {stats_threshold_limit}")
plot_boxplots(unique_feature_diff_array_month_3, current_prod_pdf_2, month_3_pdf)

# COMMAND ----------

print("\nCHECKING VARIANCES WITH LEVENE TEST.....")
check_diff_in_variances(current_prod_pdf_2, month_3_pdf, num_cols, p_threshold)

print("\nCHECKING KS TEST.....")
check_dist_ks_bonferroni_test(current_prod_pdf_2, month_3_pdf, num_cols + [target_col], p_threshold)

# COMMAND ----------

# MAGIC %md
# MAGIC
# MAGIC **Categorical features**
# MAGIC * Check expected count for each level
# MAGIC * Check the mode

# COMMAND ----------

check_categorical_diffs(current_prod_pdf_2, month_3_pdf, cat_cols, p_threshold)

# COMMAND ----------

# MAGIC %md
# MAGIC
# MAGIC **`Action`: Include new data with label drift in training**
# MAGIC
# MAGIC We observe that our label has drifted; after analysis, we find that this most recent month of data was captured during a spike in sales caused by a viral marketing campaign. As such, we will retrain our model and include this recent month of data during training.

# COMMAND ----------

# Append the new month of data (where daily sales are elevated across the board)
raw_data_month3.withColumn("month", F.lit("month_3")).write.format("delta").partitionBy("month").mode("append").save(months_gold_path)

# COMMAND ----------

# MAGIC %md
# MAGIC #### ii. Model Training
# MAGIC
# MAGIC Retrain the same model from previous months, including the additional month of data where the label has drifted.

# COMMAND ----------

# Set the month number - used for naming the MLflow run and tracked as a parameter
month = 3

# Specify name of MLflow run
run_name = f"month_{month}"

# Define the parameters to pass to the RandomForestRegressor model
model_params = {"n_estimators": 500,
                "max_depth": 5,
                "max_features": "log2"}

# Define a dictionary of parameters that we would like to use during preprocessing
misc_params = {"month": month,
               "target_col": target_col,
               "cat_cols": cat_cols,
               "num_cols": num_cols}

# COMMAND ----------

# Trigger model training and logging to MLflow
month3_run = train_sklearn_rf_model(run_name,
                                    months_gold_path,
                                    model_params,
                                    misc_params)

month_3_run_id = month3_run.info.run_id

# COMMAND ----------

# Register model to MLflow Model Registry
month_3_model_version = mlflow.register_model(model_uri=f"runs:/{month_3_run_id}/model", name=mlflow_experiment_name)

# Transition model to Staging
month_3_model_version = transition_model(month_3_model_version, stage="Staging")
print(month_3_model_version)

# COMMAND ----------

# MAGIC %md
# MAGIC
# MAGIC #### iii. Model checks prior to model deployment

# COMMAND ----------

# Get the MLflow run associated with the model currently registered in Staging
current_staging_run_2 = get_run_from_registered_model(mlflow_experiment_name, stage="Staging")

metric_to_check = "r2_score_X_test"
compare_model_perfs(current_staging_run_2, current_prod_run_2, min_model_r2_threshold, metric_to_check)

# COMMAND ----------

# MAGIC %md
# MAGIC
# MAGIC In this case we note that the new candidate model in Staging performs notably worse than the current model in Production. We know from our checks prior to training that the label has drifted, and that this was due to a spike in sales caused by a viral marketing campaign. At this point we would want to prevent a migration of the new candidate model directly to Production and instead investigate whether there is any way we can improve model performance. This could involve tuning the hyperparameters of our model, or investigating the inclusion of additional features such as "month of the year", which could allow us to capture temporal effects on sales (a quick sketch of deriving such a feature from the `Date` column follows below).
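
# COMMAND ----------

# MAGIC %md
# MAGIC The cell below is an optional, illustrative sketch of deriving such a "month of the year" feature from the `Date` column with PySpark functions. To actually use it, the new column would also need to be added to the feature lists and preprocessing before retraining.

# COMMAND ----------

# Illustrative sketch only: derive a calendar feature so the model can pick up seasonal effects on sales
gold_with_month_df = (
    spark.read.format("delta").load(months_gold_path)
    .withColumn("month_of_year", F.month(F.col("Date")))
)

display(gold_with_month_df.select("Date", "month_of_year", "Daily_Sales").limit(5))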

--------------------------------------------------------------------------------