├── Chapter-08 ├── mlflow-util.py ├── automated-testing.py └── scheduling-workflow-for-model-retraining.py ├── LICENSE ├── Chapter-09 ├── config │ └── setup.py ├── data │ └── datagen.py ├── util │ ├── training.py │ └── monitoring.py └── model-drift.py ├── Chapter-07 ├── custom-python-libraries.py ├── custom-model.py ├── real-time.py ├── batch-and-streaming.py └── real-time-additional.py ├── Chapter-04 ├── mlflow-without-featurestore.py └── mlflow-with-featurestore.py ├── README.md ├── Chapter-03 └── churn-analysis.py └── Chapter-06 └── model-registry-and-webhooks.py /Chapter-08/mlflow-util.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | import mlflow 3 | from mlflow.utils.rest_utils import http_request 4 | import json 5 | 6 | def client(): 7 | return mlflow.tracking.client.MlflowClient() 8 | 9 | host_creds = client()._tracking_client.store.get_host_creds() 10 | host = host_creds.host 11 | token = host_creds.token 12 | 13 | def mlflow_endpoint(endpoint, method, body='{}'): 14 | if method == 'GET': 15 | response = http_request( 16 | host_creds=host_creds, endpoint="/api/2.0/mlflow/{}".format(endpoint), method=method, params=json.loads(body)) 17 | else: 18 | response = http_request( 19 | host_creds=host_creds, endpoint="/api/2.0/mlflow/{}".format(endpoint), method=method, json=json.loads(body)) 20 | return response.json() 21 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2021 Packt 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /Chapter-09/config/setup.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | # MAGIC %md 3 | # MAGIC #### Model Drift monitoring on Databricks 4 | # MAGIC 5 | # MAGIC **Requirements** 6 | # MAGIC * The following notebook was developed and tested using [DBR 13.3 LTS ML](https://docs.databricks.com/en/release-notes/runtime/13.3lts-ml.html) 7 | # MAGIC 8 | # MAGIC **Authors** 9 | # MAGIC - Debu Sinha | debusinha2009@gmail.com / debu.sinha@databricks.com 10 | 11 | # COMMAND ---------- 12 | 13 | # MAGIC %md 14 | # MAGIC #1) Setup 15 | 16 | # COMMAND ---------- 17 | 18 | #import mlflow if exists else install notebook scoped libraries 19 | try: 20 | import mlflow 21 | except Exception as e: 22 | %pip install mlflow 23 | 24 | # COMMAND ---------- 25 | 26 | # Get Databricks workspace username 27 | username = dbutils.notebook.entry_point.getDbutils().notebook().getContext().tags().apply("user") 28 | print(username) 29 | 30 | # COMMAND ---------- 31 | 32 | # MAGIC %md 33 | # MAGIC ## 1.1) Setup Directory structure to store this demo related artifacts 34 | 35 | # COMMAND ---------- 36 | 37 | # Set home directory for our project 38 | project_home_dir = f"/Users/{username}/model_drift/" 39 | 40 | #set location for temporary files created in this module 41 | project_local_tmp_dir = f"/dbfs{project_home_dir}tmp/" 42 | 43 | #this is where we will store raw data in csv format 44 | raw_good_data_path= f"{project_home_dir}data/raw/good" 45 | 46 | #this is location where data for showcasing scenario 1 for feature drift and bug in the the upstream data processing 47 | raw_month2_bad_data_path = f"{project_home_dir}data/raw/bad" 48 | 49 | #this is location for delta table where we will store the gold dataset 50 | months_gold_path = f"{project_home_dir}delta/gold" 51 | 52 | dbutils.fs.rm(project_home_dir, True) 53 | dbutils.fs.rm(project_local_tmp_dir, True) 54 | 55 | #reset folders for data storage 56 | for path in [raw_good_data_path, raw_month2_bad_data_path, months_gold_path]: 57 | print(f"creating {path}") 58 | dbutils.fs.mkdirs(path) 59 | 60 | # COMMAND ---------- 61 | 62 | # MAGIC %fs 63 | # MAGIC ls /Users/debu.sinha@databricks.com/model_drift/data/ 64 | 65 | # COMMAND ---------- 66 | 67 | # MAGIC %md 68 | # MAGIC ## 1.2) MLflow experiment setup 69 | 70 | # COMMAND ---------- 71 | 72 | mlflow_experiment_name = "sales_prediction" 73 | 74 | #this has to be an absolute path in the databricks workspace. 
75 | mlflow_experiment_path = f"/Users/{username}/{mlflow_experiment_name}" 76 | 77 | # COMMAND ---------- 78 | 79 | import mlflow 80 | 81 | # We need to get the exact path of experiment 82 | experiment = mlflow.get_experiment_by_name(mlflow_experiment_path) 83 | 84 | if experiment: 85 | experiment_id = experiment.experiment_id 86 | mlflow.delete_experiment(experiment_id) 87 | print(f"Experiment {mlflow_experiment_name} deleted successfully.") 88 | 89 | # Create a new experiment with the specified name 90 | experiment_id = mlflow.create_experiment(mlflow_experiment_path) 91 | print(f"Experiment {mlflow_experiment_path} created successfully with ID {experiment_id}.") 92 | 93 | #set the experment for this module 94 | mlflow.set_experiment(mlflow_experiment_path) 95 | -------------------------------------------------------------------------------- /Chapter-07/custom-python-libraries.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | # MAGIC %md 3 | # MAGIC ## Author 4 | # MAGIC 5 | # MAGIC - **Debu Sinha** 6 | # MAGIC 7 | # MAGIC ## Tested Environment 8 | # MAGIC 9 | # MAGIC - **Databricks Runtime**: This notebook is tested on Databricks Runtime for Machine Learning 13.3 LTS or above. 10 | # MAGIC - **Cluster Configuration**: Single node cluster with at least 32GB RAM and 4 VCPU. 11 | # MAGIC - **Note**: The same cluster set up in Chapters 3 and 4 will be used here. 12 | # MAGIC 13 | # MAGIC ## Cluster Setup Instructions 14 | # MAGIC 15 | # MAGIC 1. **Create a Cluster**: 16 | # MAGIC - Navigate to the `Compute` icon on the left sidebar and click on `Create Cluster`. 17 | # MAGIC - Under `Policy`, select `Unrestricted`. 18 | # MAGIC - Enter a name for your cluster, for example, `demo`, into the cluster name text box. 19 | # MAGIC - In `Cluster Mode`, select `Single Node`. 20 | # MAGIC - Choose `Databricks Runtime Version` 13.3 LTS (Scala 2.12, Spark 3.4.1) from the `ML` tab. 21 | # MAGIC - On `AWS`, select `i3.xlarge` / on `Azure`, select `Standard_DS4_V2` as __Node type__. 22 | # MAGIC - Click on `Create Cluster` and wait for your cluster to be provisioned. 23 | # MAGIC 24 | # MAGIC 2. **Attach this Notebook to Your Cluster**: 25 | # MAGIC - Click on the menu labeled `Detached` at the top left of this workbook. 26 | # MAGIC - Select your cluster name to attach this notebook to your cluster. 
27 | 28 | # COMMAND ---------- 29 | 30 | # Import necessary libraries and modules 31 | import mlflow 32 | import mlflow.sklearn 33 | from sklearn.datasets import load_iris 34 | from sklearn.ensemble import RandomForestClassifier 35 | from pandas import DataFrame 36 | from mlflow.models import infer_signature 37 | from mlflow.models.utils import add_libraries_to_model 38 | 39 | # Initialize the MLflow run 40 | with mlflow.start_run() as run: 41 | # Load the Iris dataset 42 | iris_data = load_iris() 43 | training_data = DataFrame(data=iris_data.data, columns=iris_data.feature_names) 44 | 45 | # Initialize and train the RandomForest Classifier 46 | random_forest_model = RandomForestClassifier(max_depth=7, random_state=42) 47 | random_forest_model.fit(training_data, iris_data.target) 48 | 49 | # Infer model signature for later use 50 | model_signature = infer_signature(training_data, random_forest_model.predict(training_data)) 51 | 52 | # Log the trained model to MLflow 53 | mlflow.sklearn.log_model(random_forest_model, "iris_classifier", 54 | signature=model_signature, 55 | registered_model_name="enhanced_model_with_libraries") 56 | 57 | # Model URI for accessing the registered model 58 | access_model_uri = "models:/enhanced_model_with_libraries/1" 59 | 60 | # Add libraries to the original model run 61 | add_libraries_to_model(access_model_uri) 62 | 63 | # Example to add libraries to an existing run 64 | # prev_run_id = "some_existing_run_id" 65 | # add_libraries_to_model(access_model_uri, run_id=prev_run_id) 66 | 67 | 68 | # Example to add libraries to a new run 69 | with mlflow.start_run(): 70 | add_libraries_to_model(access_model_uri) 71 | 72 | # Example to add libraries and register under a new model name 73 | with mlflow.start_run(): 74 | add_libraries_to_model(access_model_uri, registered_model_name="new_enhanced_model") 75 | -------------------------------------------------------------------------------- /Chapter-08/automated-testing.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | # MAGIC %md 3 | # MAGIC Load the model name. The **`event_message`** is automatically populated by the webhook. 4 | 5 | # COMMAND ---------- 6 | 7 | 8 | import json 9 | 10 | event_message = dbutils.widgets.get("event_message") 11 | event_message_dict = json.loads(event_message) 12 | model_name = event_message_dict.get("model_name") 13 | 14 | print(event_message_dict) 15 | print(model_name) 16 | 17 | # COMMAND ---------- 18 | 19 | # MAGIC %md Use the model name to get the latest model version. 20 | 21 | # COMMAND ---------- 22 | 23 | # MAGIC %run ./mlflow-util 24 | 25 | # COMMAND ---------- 26 | 27 | from mlflow.tracking import MlflowClient 28 | client = MlflowClient() 29 | 30 | version = event_message_dict.get("version") 31 | version 32 | 33 | # COMMAND ---------- 34 | 35 | # MAGIC %md Use the model name and version to load a **`pyfunc`** model of our model in staging environment. 36 | 37 | # COMMAND ---------- 38 | 39 | import mlflow 40 | 41 | pyfunc_model = mlflow.pyfunc.load_model(model_uri=f"models:/{model_name}/{version}") 42 | 43 | # COMMAND ---------- 44 | 45 | # MAGIC %md Get the input schema of our logged model. 46 | 47 | # COMMAND ---------- 48 | 49 | input_schema = pyfunc_model.metadata.get_input_schema().as_spark_schema() 50 | 51 | # COMMAND ---------- 52 | 53 | # MAGIC %md Here we define our expected input schema. 
54 | 55 | # COMMAND ---------- 56 | 57 | from pyspark.sql.types import StringType, StructField, IntegerType, DoubleType, StructType 58 | 59 | expected_input_schema = (StructType([ 60 | StructField("CreditScore", IntegerType(), True), 61 | StructField("Geography", StringType(), True), 62 | StructField("Gender", StringType(), True), 63 | StructField("Age", IntegerType(), True), 64 | StructField("Tenure", IntegerType(), True), 65 | StructField("Balance", DoubleType(), True), 66 | StructField("NumOfProducts", IntegerType(), True), 67 | StructField("HasCrCard", IntegerType(), True), 68 | StructField("isActiveMember", IntegerType(), True), 69 | StructField("EstimatedSalary", DoubleType(), True) 70 | ])) 71 | 72 | # COMMAND ---------- 73 | 74 | if sorted(expected_input_schema.fields, key=lambda x: x.name) != sorted(input_schema.fields, key=lambda x: x.name): 75 | comment = "This model failed the input schema check" 76 | comment_body = {'name': model_name, 'version': version, 'comment': comment} 77 | mlflow_endpoint('comments/create', 'POST', json.dumps(comment_body)) 78 | raise Exception("Input schema mismatch") 79 | 80 | # COMMAND ---------- 81 | 82 | # MAGIC %md Load the dataset and generate some predictions to ensure our model is working correctly. 83 | 84 | # COMMAND ---------- 85 | 86 | import pandas as pd 87 | 88 | # read the raw dataset provided with the code base 89 | sample_data = spark.table("bank_churn_analysis.raw_data") 90 | df = sample_data.toPandas() 91 | 92 | # exclude the columns that are not used for prediction 93 | excluded_columns = {"RowNumber", "CustomerId", "Surname"} 94 | df_input = df[[col for col in df.columns if col not in excluded_columns]] 95 | 96 | df_input.head() 97 | 98 | # COMMAND ---------- 99 | 100 | predictions = pyfunc_model.predict(df_input) 101 | 102 | # COMMAND ---------- 103 | 104 | # MAGIC %md Make sure our prediction types are correct. 105 | 106 | # COMMAND ---------- 107 | 108 | import numpy as np 109 | 110 | if type(predictions) != np.ndarray or type(predictions[0]) != np.int32: 111 | comment = "This model failed the prediction type check" 112 | comment_body = {'name': model_name, 'version': version, 'comment': comment} 113 | mlflow_endpoint('comments/create', 'POST', json.dumps(comment_body)) 114 | raise Exception("Prediction datatype is not as expected") 115 | 116 | # COMMAND ---------- 117 | 118 | # Leave a comment for the ML engineer who will be reviewing the tests 119 | comment = "This model passed all the tests" 120 | comment_body = {'name': model_name, 'version': version, 'comment': comment} 121 | mlflow_endpoint('comments/create', 'POST', json.dumps(comment_body)) 122 | -------------------------------------------------------------------------------- /Chapter-08/scheduling-workflow-for-model-retraining.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | # MAGIC %md 3 | # MAGIC ## Training Workflow 4 | # MAGIC 5 | # MAGIC In this notebook, we'll create a workflow to retrain our model. Then, we'll set up this notebook to run monthly using a Databricks Job to ensure our model is always up-to-date. 6 | # MAGIC 7 | # MAGIC ### Load Features 8 | # MAGIC 9 | # MAGIC First, we'll load our feature table, which in this case is the original raw dataset. 10 | # MAGIC 11 | # MAGIC 12 | # MAGIC In the case of this demonstration, these are the same records, but in a real-world scenario we'd likely have updated records appended to this table each time the model is trained.
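For reference only: the monthly schedule mentioned above is typically configured through the Databricks Jobs UI. The snippet below is a minimal, hedged sketch of how the same monthly trigger could be created programmatically with the Databricks Jobs API (2.1); the job name, notebook path, cluster ID, and cron expression are illustrative placeholders rather than values from the book.

```python
# Hedged sketch: create a monthly Databricks Job for this notebook via the Jobs API (2.1).
# All names, paths, and IDs below are placeholders for illustration.
import requests

# Reuse the same notebook-context calls this repository uses to obtain a token and workspace host.
token = dbutils.notebook.entry_point.getDbutils().notebook().getContext().apiToken().getOrElse(None)
java_tags = dbutils.notebook.entry_point.getDbutils().notebook().getContext().tags()
tags = sc._jvm.scala.collection.JavaConversions.mapAsJavaMap(java_tags)
instance = tags["browserHostName"]

job_payload = {
    "name": "monthly-churn-model-retraining",  # placeholder job name
    "tasks": [
        {
            "task_key": "retrain",
            "notebook_task": {
                # placeholder path; point this at the retraining notebook in your workspace
                "notebook_path": "/Users/<your-user>/scheduling-workflow-for-model-retraining"
            },
            "existing_cluster_id": "<your-cluster-id>",  # placeholder cluster ID
        }
    ],
    # Quartz cron: 06:00 UTC on the first day of every month
    "schedule": {
        "quartz_cron_expression": "0 0 6 1 * ?",
        "timezone_id": "UTC",
        "pause_status": "UNPAUSED",
    },
}

response = requests.post(
    f"https://{instance}/api/2.1/jobs/create",
    headers={"Authorization": f"Bearer {token}"},
    json=job_payload,
)
response.raise_for_status()
print(response.json())  # returns the job_id of the newly created job
```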
13 | 14 | # COMMAND ---------- 15 | 16 | # MAGIC %pip install databricks-registry-webhooks 17 | 18 | # COMMAND ---------- 19 | 20 | database_name = "bank_churn_analysis" 21 | 22 | #we will exclude the same columns that we did earlier while training our model using AutoML from UI. 23 | excluded_featured_from_raw = {"RowNumber", "CustomerId", "Surname"} 24 | target_column = "Exited" 25 | 26 | new_data = spark.table(f"{database_name}.raw_data") 27 | features = [c for c in new_data.columns if c not in excluded_featured_from_raw] 28 | 29 | # COMMAND ---------- 30 | 31 | # MAGIC %md 32 | # MAGIC ## Add webhook for kicking off automated testing job 33 | 34 | # COMMAND ---------- 35 | 36 | # get token from notebook 37 | token = dbutils.notebook.entry_point.getDbutils().notebook().getContext().apiToken().getOrElse(None) 38 | 39 | #create authorization header for REST calls 40 | headers = {"Authorization": f"Bearer {token}"} 41 | 42 | # Next we need an enpoint at which to execute our request which we can get from the Notebook's tags collection 43 | java_tags = dbutils.notebook.entry_point.getDbutils().notebook().getContext().tags() 44 | 45 | # This object comes from the Java CM 46 | tags = sc._jvm.scala.collection.JavaConversions.mapAsJavaMap(java_tags) 47 | 48 | # extract the databricks instance (domain name) from the dictionary 49 | instance = tags["browserHostName"] 50 | 51 | # COMMAND ---------- 52 | 53 | model_name = "Churn Prediction Bank" 54 | 55 | # COMMAND ---------- 56 | 57 | from databricks_registry_webhooks import RegistryWebhooksClient, JobSpec 58 | 59 | job_spec = JobSpec( 60 | job_id="295266394513960", 61 | workspace_url="https://"+instance, 62 | access_token=token 63 | ) 64 | 65 | job_webhook = RegistryWebhooksClient().create_webhook( 66 | model_name=model_name, 67 | events=["TRANSITION_REQUEST_TO_STAGING_CREATED"], 68 | job_spec=job_spec, 69 | description="Registering webhook to automate testing of a new candidate model for staging" 70 | ) 71 | 72 | job_webhook 73 | 74 | # COMMAND ---------- 75 | 76 | # Test the Job webhook 77 | # RegistryWebhooksClient().test_webhook(id=job_webhook.id) 78 | 79 | # COMMAND ---------- 80 | 81 | # MAGIC %md 82 | # MAGIC ### AutoML Process 83 | # MAGIC 84 | # MAGIC Next, we'll use the AutoML API to kick off an AutoML classification experiment. This is similar to what we did with the AutoML UI, but we can use the API to automate this process. 85 | 86 | # COMMAND ---------- 87 | 88 | import databricks.automl 89 | model = databricks.automl.classify( 90 | new_data.select(features), 91 | target_col=target_column, 92 | primary_metric="f1", 93 | timeout_minutes=5, 94 | max_trials=30, 95 | ) 96 | 97 | # COMMAND ---------- 98 | 99 | #information about the latest automl model training 100 | help(model) 101 | 102 | # COMMAND ---------- 103 | 104 | # MAGIC %md 105 | # MAGIC ### Register the Best Model 106 | # MAGIC 107 | # MAGIC Once the AutoML experiment is done, we can identify the best model from the experiment and register that model to the Model Registry. 
108 | 109 | # COMMAND ---------- 110 | 111 | import mlflow 112 | from mlflow.tracking.client import MlflowClient 113 | 114 | client = MlflowClient() 115 | 116 | run_id = model.best_trial.mlflow_run_id 117 | 118 | model_uri = f"runs:/{run_id}/model" 119 | 120 | model_details = mlflow.register_model(model_uri, model_name) 121 | 122 | # COMMAND ---------- 123 | 124 | # MAGIC %md 125 | # MAGIC ### Request model Transition to Staging 126 | # MAGIC 127 | # MAGIC Once the model is registered, we request that it be transitioned to the **Staging** stage for testing. 128 | # MAGIC 129 | # MAGIC First, we'll includ a helper function to interact with the MLflow registry API. In your production environment its always a good practice to modularize your code for maintainability. 130 | 131 | # COMMAND ---------- 132 | 133 | # MAGIC %run ./mlflow-util 134 | 135 | # COMMAND ---------- 136 | 137 | # MAGIC %md 138 | # MAGIC Next, we'll set up the transition request using the `mlflow_endpoint` operation from the helpers notebook. 139 | 140 | # COMMAND ---------- 141 | 142 | staging_request = {'name': model_name, 'version': model_details.version, 'stage': 'Staging', 'archive_existing_versions': 'false'} 143 | mlflow_endpoint('transition-requests/create', 'POST', json.dumps(staging_request)) 144 | 145 | # COMMAND ---------- 146 | 147 | # MAGIC %md 148 | # MAGIC And we'll add a comment to the version of the model that we just requested be moved to **Staging** to let the machine learning engineer know why we are making the request. 149 | 150 | # COMMAND ---------- 151 | 152 | # Leave a comment for the ML engineer who will be reviewing the tests 153 | comment = "This was the best model from the most recent AutoML run. Ready for testing" 154 | comment_body = {'name': model_name, 'version': model_details.version, 'comment': comment} 155 | mlflow_endpoint('comments/create', 'POST', json.dumps(comment_body)) 156 | 157 | # COMMAND ---------- 158 | 159 | 160 | -------------------------------------------------------------------------------- /Chapter-07/custom-model.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | # MAGIC %md 3 | # MAGIC ## Author 4 | # MAGIC 5 | # MAGIC - **Debu Sinha** 6 | # MAGIC 7 | # MAGIC ## Tested Environment 8 | # MAGIC 9 | # MAGIC - **Databricks Runtime**: This notebook is tested on Databricks Runtime for Machine Learning 13.3 LTS or above. 10 | # MAGIC - **Cluster Configuration**: Single node cluster with at least 32GB RAM and 4 VCPU. 11 | # MAGIC - **Note**: The same cluster set up in Chapters 3 and 4 will be used here. 12 | # MAGIC 13 | # MAGIC ## Cluster Setup Instructions 14 | # MAGIC 15 | # MAGIC 1. **Create a Cluster**: 16 | # MAGIC - Navigate to the `Compute` icon on the left sidebar and click on `Create Cluster`. 17 | # MAGIC - Under `Policy`, select `Unrestricted`. 18 | # MAGIC - Enter a name for your cluster, for example, `demo`, into the cluster name text box. 19 | # MAGIC - In `Cluster Mode`, select `Single Node`. 20 | # MAGIC - Choose `Databricks Runtime Version` 13.3 LTS (Scala 2.12, Spark 3.4.1) from the `ML` tab. 21 | # MAGIC - On `AWS`, select `i3.xlarge` / on `Azure`, select `Standard_DS4_V2` as __Node type__. 22 | # MAGIC - Click on `Create Cluster` and wait for your cluster to be provisioned. 23 | # MAGIC 24 | # MAGIC 2. **Attach this Notebook to Your Cluster**: 25 | # MAGIC - Click on the menu labeled `Detached` at the top left of this workbook. 
26 | # MAGIC - Select your cluster name to attach this notebook to your cluster. 27 | # MAGIC 28 | # MAGIC ## Predicting Wine Cultivars using Decision Tree Classifier and MLflow 29 | # MAGIC 30 | # MAGIC This code is designed to solve a multi-class classification problem using the wine dataset. The wine dataset contains 178 samples, each belonging to one of three different cultivars (types of grape) in Italy. Each sample has 13 different features like Alcohol, Malic acid, etc. 31 | # MAGIC 32 | # MAGIC ### Objective 33 | # MAGIC 34 | # MAGIC The objective of the model is to predict the cultivar to which a given wine sample belongs based on its 13 features. In simpler terms, for a new wine sample, the model aims to categorize it as 'class_0', 'class_1', or 'class_2', representing one of the three possible cultivars. Additionally, the model provides the probabilities for the sample belonging to each of these classes. 35 | # MAGIC 36 | # MAGIC ### Implementation 37 | # MAGIC 38 | # MAGIC The code uses a Decision Tree classifier and trains it on a subset of the wine dataset, known as the training set. After training, the model is encapsulated in a custom Python class (`CustomModelWrapper`). This class facilitates the logging of the model using MLflow, a platform for end-to-end machine learning lifecycle management. 39 | # MAGIC 40 | # MAGIC Once the model is logged, it can be deployed and used to make predictions on new, unseen data, commonly referred to as the test set. 41 | 42 | # COMMAND ---------- 43 | 44 | from sklearn.datasets import load_wine 45 | from sklearn.model_selection import train_test_split 46 | from sklearn.tree import DecisionTreeClassifier 47 | from mlflow.models.signature import ModelSignature 48 | from mlflow.types.schema import Schema, ColSpec 49 | import mlflow 50 | import mlflow.pyfunc 51 | import pandas as pd 52 | 53 | # Custom model class 54 | class CustomModelWrapper(mlflow.pyfunc.PythonModel): 55 | # Initialize the classifier model in the constructor 56 | def __init__(self, classifier_model): 57 | self.classifier_model = classifier_model 58 | 59 | # Prediction method 60 | def predict(self, context, model_data): 61 | # Compute the probabilities and the classes 62 | probs = self.classifier_model.predict_proba(model_data) 63 | preds = self.classifier_model.predict(model_data) 64 | 65 | # Create a DataFrame to hold probabilities and predictions 66 | labels = ["class_0", "class_1", "class_2"] 67 | result_df = pd.DataFrame(probs, columns=[f'prob_{label}' for label in labels]) 68 | result_df['prediction'] = [labels[i] for i in preds] 69 | 70 | return result_df 71 | 72 | # Load the wine dataset and split it into training and test sets 73 | wine_data = load_wine() 74 | X, y = wine_data.data, wine_data.target 75 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=7) 76 | 77 | # Initialize and fit the DecisionTreeClassifier 78 | dt_classifier = DecisionTreeClassifier(random_state=7) 79 | dt_classifier.fit(X_train, y_train) 80 | 81 | # Create an instance of the CustomModelWrapper 82 | custom_wrapper = CustomModelWrapper(dt_classifier) 83 | 84 | # Define the input and output schema 85 | input_cols = [ColSpec("double", feature) for feature in wine_data.feature_names] 86 | output_cols = [ColSpec("double", f'prob_{cls}') for cls in wine_data.target_names] + [ColSpec("string", 'prediction')] 87 | model_sign = ModelSignature(inputs=Schema(input_cols), outputs=Schema(output_cols)) 88 | 89 | # Prepare an example input 90 | input_sample = 
pd.DataFrame(X_train[:1], columns=wine_data.feature_names) 91 | input_sample_dict = input_sample.to_dict(orient='list') 92 | 93 | # Log the model using MLflow 94 | with mlflow.start_run(): 95 | mlflow.pyfunc.log_model("wine_model", 96 | python_model=custom_wrapper, 97 | input_example=input_sample_dict, 98 | signature=model_sign) 99 | 100 | # Retrieve the run ID and load the logged model 101 | last_run_id = mlflow.last_active_run().info.run_id 102 | retrieved_model = mlflow.pyfunc.load_model(f"runs:/{last_run_id}/wine_model") 103 | 104 | # Create a DataFrame for the test data 105 | test_df = pd.DataFrame(X_test[:1], columns=wine_data.feature_names) 106 | 107 | # Use the loaded model for prediction 108 | prediction_result = retrieved_model.predict(test_df) 109 | 110 | 111 | # COMMAND ---------- 112 | 113 | prediction_result 114 | 115 | # COMMAND ---------- 116 | 117 | 118 | -------------------------------------------------------------------------------- /Chapter-04/mlflow-without-featurestore.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | # MAGIC %md # MLflow introduction. 3 | # MAGIC 4 | # MAGIC This tutorial covers an example of how to use the integrated MLflow tracking capabilities to track your model training with the integrated feature store. 5 | # MAGIC - Import data from the Delta table that contains feature engineered datasets. 6 | # MAGIC - Create a baseline model for churn prediction and store it in the integrated MLflow tracking server. 7 | 8 | # COMMAND ---------- 9 | 10 | # MAGIC %md 11 | # MAGIC ###0. SETUP -- Databricks Spark cluster: 12 | # MAGIC 13 | # MAGIC 1. **Create** a cluster by... 14 | # MAGIC - Click the `Compute` icon on the left sidebar and then `Create Cluster.` 15 | # MAGIC - In `Policy` select `Unrestricted`. 16 | # MAGIC - Enter any text, i.e `demo` into the cluster name text box. 17 | # MAGIC - Select `Single Node` in the cluster mode. 18 | # MAGIC - Select the `Databricks runtime version` value `13.3 LTS (Scala 2.12, Spark 3.4.1)` from the `ML` tab. 19 | # MAGIC - On `AWS`, select `i3.xlarge` / on `Azure`, select `Standard_DS4_V2` as __Node type__. 20 | # MAGIC - Click the `create cluster` button and wait for your cluster to be provisioned 21 | # MAGIC 3. **Attach** this notebook to your cluster by... 22 | # MAGIC - Click on your cluster name in menu `Detached` at the top left of this workbook to attach it to this workbook 23 | 24 | # COMMAND ---------- 25 | 26 | #install latest version of sklearn 27 | %pip install -U scikit-learn 28 | 29 | # COMMAND ---------- 30 | 31 | # MAGIC %md 32 | # MAGIC ### Step 1) Importing the desired libraries and defining few constants. 33 | # MAGIC 34 | # MAGIC - Note:
35 | # MAGIC - In this example, the feature table is the same one we created in Chapter 3; however, we will not use the Feature Store API to access the data in the feature table.
36 | # MAGIC - As explained in chapter 3, all the offline feature tables are backed as Delta tables and are searchable through the integrated Hive metastore in Databricks. This allows us to read these tables like a regular external or managed table. 37 | 38 | # COMMAND ---------- 39 | 40 | from databricks.feature_store import FeatureStoreClient 41 | from databricks.feature_store import FeatureLookup 42 | import typing 43 | 44 | from sklearn import metrics 45 | from sklearn.ensemble import RandomForestClassifier 46 | from sklearn.model_selection import train_test_split 47 | import mlflow 48 | import pandas as pd 49 | 50 | # COMMAND ---------- 51 | 52 | #Name of experiment where we will track all the different model training runs. 53 | EXPERIMENT_NAME = "Bank_Customer_Churn_Analysis" 54 | #Name of the model 55 | MODEL_NAME = "random_forest_classifier" 56 | #This is the name for the entry in model registry 57 | MODEL_REGISTRY_NAME = "Bank_Customer_Churn" 58 | #The email you use to authenticate in the Databricks workspace 59 | USER_EMAIL = "debu.sinha@databricks.com" 60 | #Location where the MLflow experiement will be listed in user workspace 61 | EXPERIMENT_NAME = f"/Users/{USER_EMAIL}/{EXPERIMENT_NAME}" 62 | # we have all the features backed into a Delta table so we will read directly 63 | FEATURE_TABLE = "bank_churn_analysis.bank_customer_features" 64 | 65 | # COMMAND ---------- 66 | 67 | # MAGIC %md 68 | # MAGIC ### Step 2) Build a simplistic model that uses the feature store table as its source for training and validation. 69 | 70 | # COMMAND ---------- 71 | 72 | # set experiment name 73 | mlflow.set_experiment(EXPERIMENT_NAME) 74 | 75 | with mlflow.start_run(): 76 | TEST_SIZE = 0.20 77 | 78 | # Now we will read the data directly from the feature table 79 | training_df = spark.table(FEATURE_TABLE) 80 | 81 | # convert the dataset to pandas so that we can fit sklearn RandomForestClassifier on it 82 | train_df = training_df.toPandas() 83 | 84 | # The train_df represents the input dataframe that has all the feature columns along with the new raw input in the form of training_df. 85 | X = train_df.drop(['Exited'], axis=1) 86 | y = train_df['Exited'] 87 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=TEST_SIZE, random_state=54, stratify=y) 88 | 89 | # here we will are not doing any hyperparameter tuning however, in future we will see how to perform hyperparameter tuning in scalable manner on Databricks. 90 | model = RandomForestClassifier(n_estimators=100).fit(X_train, y_train) 91 | signature = mlflow.models.signature.infer_signature(X_train, model.predict(X_train)) 92 | 93 | predictions = model.predict(X_test) 94 | fpr, tpr, _ = metrics.roc_curve(y_test, predictions, pos_label=1) 95 | auc = metrics.auc(fpr, tpr) 96 | accuracy = metrics.accuracy_score(y_test, predictions) 97 | 98 | # get the calculated feature importances. 
99 | importances = dict(zip(model.feature_names_in_, model.feature_importances_)) 100 | # log artifact 101 | mlflow.log_dict(importances, "feature_importances.json") 102 | # log metrics 103 | mlflow.log_metric("auc", auc) 104 | mlflow.log_metric("accuracy", accuracy) 105 | # log parameters 106 | mlflow.log_param("split_size", TEST_SIZE) 107 | mlflow.log_params(model.get_params()) 108 | # set tag 109 | mlflow.set_tag(MODEL_NAME, "mlflow demo") 110 | # log the model itself in mlflow tracking server 111 | mlflow.sklearn.log_model(model, MODEL_NAME, signature=signature, input_example=X_train.iloc[:4, :]) 112 | 113 | # COMMAND ---------- 114 | 115 | from mlflow.tracking import MlflowClient 116 | #initialize the mlflow client 117 | client = MlflowClient() 118 | #get the experiment id 119 | experiment_id = mlflow.get_experiment_by_name(EXPERIMENT_NAME).experiment_id 120 | #get the latest run id, which will allow us to directly access the metrics, attributes, and all the info 121 | run_id = mlflow.search_runs(experiment_id, order_by=["start_time DESC"]).head(1)["run_id"].values[0] 122 | #now we will register the latest model into the model registry 123 | new_model_version = mlflow.register_model(f"runs:/{run_id}/{MODEL_NAME}", MODEL_REGISTRY_NAME) 124 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 

[Image: Machine Learning Summit 2025 banner]

2 | 3 | ## Machine Learning Summit 2025 4 | **Bridging Theory and Practice: ML Solutions for Today’s Challenges** 5 | 6 | 3 days, 20+ experts, and 25+ tech sessions and talks covering critical aspects of: 7 | - **Agentic and Generative AI** 8 | - **Applied Machine Learning in the Real World** 9 | - **ML Engineering and Optimization** 10 | 11 | 👉 [Book your ticket now >>](https://packt.link/mlsumgh) 12 | 13 | --- 14 | 15 | ## Join Our Newsletters 📬 16 | 17 | ### DataPro 18 | *The future of AI is unfolding. Don’t fall behind.* 19 | 20 |

[Image: DataPro newsletter QR code]

21 | 22 | Stay ahead with [**DataPro**](https://landing.packtpub.com/subscribe-datapronewsletter/?link_from_packtlink=yes), the free weekly newsletter for data scientists, AI/ML researchers, and data engineers. 23 | From trending tools like **PyTorch**, **scikit-learn**, **XGBoost**, and **BentoML** to hands-on insights on **database optimization** and real-world **ML workflows**, you’ll get what matters, fast. 24 | 25 | > Stay sharp with [DataPro](https://landing.packtpub.com/subscribe-datapronewsletter/?link_from_packtlink=yes). Join **115K+ data professionals** who never miss a beat. 26 | 27 | --- 28 | 29 | ### BIPro 30 | *Business runs on data. Make sure yours tells the right story.* 31 | 32 |

[Image: BIPro newsletter QR code]

33 | 34 | [**BIPro**](https://landing.packtpub.com/subscribe-bipro-newsletter/?link_from_packtlink=yes) is your free weekly newsletter for BI professionals, analysts, and data leaders. 35 | Get practical tips on **dashboarding**, **data visualization**, and **analytics strategy** with tools like **Power BI**, **Tableau**, **Looker**, **SQL**, and **dbt**. 36 | 37 | > Get smarter with [BIPro](https://landing.packtpub.com/subscribe-bipro-newsletter/?link_from_packtlink=yes). Trusted by **35K+ BI professionals**, see what you’re missing. 38 | 39 | # Practical Machine Learning on Databricks 40 | 41 | 42 | 43 | This is the code repository for [Practical Machine Learning on Databricks](https://www.amazon.com/Practical-Data-Science-Databricks-end/dp/1801812039?utm_source=github&utm_medium=repository&utm_campaign=9781837631285), published by Packt. 44 | 45 | **Seamlessly transition ML models and MLOps on Databricks** 46 | 47 | ## What is this book about? 48 | Unleash the potential of databricks for end-to-end machine learning with this comprehensive guide, tailored for experienced data scientists and developers transitioning from DIY or other cloud platforms. Building on a strong foundation in Python, Practical Machine Learning on Databricks serves as your roadmap from development to production, covering all intermediary steps using the databricks platform. 49 | 50 | This book covers the following exciting features: 51 | * Transition smoothly from DIY setups to databricks 52 | * Master AutoML for quick ML experiment setup 53 | * Automate model retraining and deployment 54 | * Leverage databricks feature store for data prep 55 | * Use MLflow for effective experiment tracking 56 | * Gain practical insights for scalable ML solutions 57 | * Find out how to handle model drifts in production environments 58 | 59 | If you feel this book is for you, get your [copy](https://www.amazon.com/dp/1801812039) today! 60 | 61 | https://www.packtpub.com/ 63 | 64 | ## Instructions and Navigations 65 | All of the code is organized into folders. For example, Chapter02. 66 | 67 | The code will look like the following: 68 | ``` 69 | iris = load_iris() 70 | 71 | X = iris.data # Features 72 | 73 | y = iris.target # Labels 74 | 75 | ``` 76 | 77 | **Following is what you need for this book:** 78 | This book is for experienced data scientists, engineers, and developers proficient in Python, statistics, and ML lifecycle looking to transition to databricks from DIY clouds. Introductory Spark knowledge is a must to make the most out of this book, however, end-to-end ML workflows will be covered. If you aim to accelerate your machine learning workflows and deploy scalable, robust solutions, this book is an indispensable resource. 79 | 80 | With the following software and hardware list you can run all code files present in the book (Chapter 1-10). 
81 | ### Software and Hardware List 82 | | Chapter | Software required | OS required | 83 | | -------- | ------------------------------------ | ----------------------------------- | 84 | | 1-10 | Databricks Runtime | Windows and Mac OS | 85 | | 1-10 | Python proficiency (3.x) | Windows and Mac OS | 86 | | 1-10 | Statistics and ML basics | Windows and Mac OS | 87 | | 1-10 | Spark knowledge (3.0 or above) | Windows and Mac OS | 88 | | 1-10 | Delta Lake features (optional) | Windows and Mac OS | 89 | 90 | ### Related products 91 | * Machine Learning for Emotion Analysis in Python [[Packt]](https://www.packtpub.com/product/machine-learning-for-emotion-analysis-in-python/9781803240688?utm_source=github&utm_medium=repository&utm_campaign=9781803240688) [[Amazon]](https://www.amazon.com/dp/1803240687) 92 | 93 | * Machine Learning with LightGBM and Python [[Packt]](https://www.packtpub.com/product/machine-learning-with-lightgbm-and-python/9781800564749?utm_source=github&utm_medium=repository&utm_campaign=9781800564749) [[Amazon]](https://www.amazon.com/dp/1800564740) 94 | 95 | ## Get to Know the Author 96 | **Debu Sinha** 97 | is an experienced data science and engineering leader with deep expertise in software engineering and solutions architecture. With over 10 years in the industry, Debu has a proven track record in designing scalable software applications and big data, and machine learning systems. As lead ML specialist on the Specialist Solutions Architect team at Databricks, Debu focuses on AI/ML use cases in the cloud and serves as an expert on LLMs, ML, and MLOps. With prior experience as a start-up co-founder, Debu has demonstrated skills in team-building, scaling, and delivering impactful software solutions. An established thought leader, Debu has received multiple awards and regularly speaks at industry events. 98 | 99 | 100 | 101 | 102 | -------------------------------------------------------------------------------- /Chapter-07/real-time.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | # MAGIC %md # 3 | 4 | # COMMAND ---------- 5 | 6 | # MAGIC %md 7 | # MAGIC ## Author 8 | # MAGIC 9 | # MAGIC - **Debu Sinha** 10 | # MAGIC 11 | # MAGIC ## Tested Environment 12 | # MAGIC 13 | # MAGIC - **Databricks Runtime**: This notebook is tested on Databricks Runtime for Machine Learning 13.3 LTS or above. 14 | # MAGIC - **Cluster Configuration**: Single node cluster with at least 32GB RAM and 4 VCPU. 15 | # MAGIC - **Note**: The same cluster set up in Chapters 3 and 4 will be used here. 16 | # MAGIC 17 | # MAGIC ## Cluster Setup Instructions 18 | # MAGIC 19 | # MAGIC 1. **Create a Cluster**: 20 | # MAGIC - Navigate to the `Compute` icon on the left sidebar and click on `Create Cluster`. 21 | # MAGIC - Under `Policy`, select `Unrestricted`. 22 | # MAGIC - Enter a name for your cluster, for example, `demo`, into the cluster name text box. 23 | # MAGIC - In `Cluster Mode`, select `Single Node`. 24 | # MAGIC - Choose `Databricks Runtime Version` 13.3 LTS (Scala 2.12, Spark 3.4.1) from the `ML` tab. 25 | # MAGIC - On `AWS`, select `i3.xlarge` / on `Azure`, select `Standard_DS4_V2` as __Node type__. 26 | # MAGIC - Click on `Create Cluster` and wait for your cluster to be provisioned. 27 | # MAGIC 28 | # MAGIC 2. **Attach this Notebook to Your Cluster**: 29 | # MAGIC - Click on the menu labeled `Detached` at the top left of this workbook. 30 | # MAGIC - Select your cluster name to attach this notebook to your cluster. 
31 | # MAGIC 32 | # MAGIC ## Real Time Deployment Options 33 | # MAGIC 34 | # MAGIC * **Databricks Integrated Serving Endpoints**: These endpoints offer a comprehensive solution for both prototyping and production deployment of models. They are designed to manage real-time requests through REST APIs. We are going to cover this approach in the notebook. 35 | # MAGIC 36 | # MAGIC ### Additional options 37 | # MAGIC 38 | # MAGIC MLflow integrates seamlessly with managed services across various cloud platforms if your intent is to use cloud specific model serving capabilities: 39 | # MAGIC 40 | # MAGIC - **Azure ML**: For Microsoft Azure 41 | # MAGIC - **SageMaker**: For AWS 42 | # MAGIC - **Vertex AI**: For Google Cloud Platform 43 | # MAGIC 44 | # MAGIC ### Custom Deployments 45 | # MAGIC 46 | # MAGIC If you're seeking a more custom deployment, you can: 47 | # MAGIC 48 | # MAGIC - Export the model from the Model Registry as a Python pickle file. 49 | # MAGIC - Create your own Flask application to serve the model. 50 | # MAGIC 51 | # MAGIC **Note**: This custom approach often leverages containerization technologies like Docker or orchestration solutions like Kubernetes. 52 | # MAGIC 53 | # MAGIC 54 | 55 | # COMMAND ---------- 56 | 57 | # MAGIC %md 58 | # MAGIC ### Databricks Serving Endpoint 59 | # MAGIC We will use the model for our Bank Customer Churn prediction that we enabled serving for through the UI. On the serving page you can find code snippets that show you exactly how to call the deployed model. Here we are going to dynamically generate the URI for the deployed model so that you can execute this code in your workspace without change. 60 | 61 | # COMMAND ---------- 62 | 63 | # get token from notebook 64 | token = dbutils.notebook.entry_point.getDbutils().notebook().getContext().apiToken().getOrElse(None) 65 | 66 | #create authorization header for REST calls 67 | headers = { 68 | "Authorization": f"Bearer {token}", 69 | "Content-Type": "application/json" 70 | } 71 | 72 | 73 | # Next we need an enpoint at which to execute our request which we can get from the Notebook's tags collection 74 | java_tags = dbutils.notebook.entry_point.getDbutils().notebook().getContext().tags() 75 | 76 | # This object comes from the Java CM 77 | tags = sc._jvm.scala.collection.JavaConversions.mapAsJavaMap(java_tags) 78 | 79 | # extract the databricks instance (domain name) from the dictionary 80 | instance = tags["browserHostName"] 81 | 82 | # COMMAND ---------- 83 | 84 | # MAGIC %md Defining a function called `score_model` that will pass JSON string as input to the model and get response back. 85 | 86 | # COMMAND ---------- 87 | 88 | # Import the requests library for HTTP communication 89 | import requests 90 | 91 | #change the model_serving_endpoint_name to the one you have given. 
92 | model_serving_endpoint_name = "churn_prediction" 93 | 94 | # Define the function 'score_model' which takes a dictionary as an input 95 | def score_model(data_json: str): 96 | 97 | # Construct the URL for the model serving endpoint 98 | url = f"https://{instance}/serving-endpoints/{model_serving_endpoint_name}/invocations" 99 | 100 | # Make an HTTP POST request to score the model 101 | response = requests.request(method="POST", headers=headers, url=url, data=data_json) 102 | 103 | # Check if the request was successful (HTTP status code 200) 104 | if response.status_code != 200: 105 | # If not, raise an exception detailing the failure 106 | raise Exception(f"Request failed with status {response.status_code}, {response.text}") 107 | 108 | # Return the JSON response from the model scoring endpoint 109 | return response.json() 110 | 111 | # COMMAND ---------- 112 | 113 | #reading a sample of raw data 114 | raw_data_spark_df = spark.table("bank_churn_analysis.raw_data") 115 | 116 | input_cols = [col for col in raw_data_spark_df.columns if col not in {'RowNumber', 'CustomerId', 'Surname', 'Exited'}] 117 | 118 | #drop the columns that will not be send to model as input 119 | raw_data_spark_df = raw_data_spark_df.select(*[input_cols]) 120 | 121 | pandas_df = raw_data_spark_df.toPandas() 122 | #convert to pandas dataframe 123 | 124 | #lets take 2 sample records to use as input for our serving endpoint 125 | input_examples_df_records = pandas_df[:2] 126 | input_examples_df_records 127 | 128 | # COMMAND ---------- 129 | 130 | # MAGIC %md 131 | # MAGIC ### DataFrame Records Format 132 | # MAGIC ####Overview 133 | # MAGIC The DataFrame Records format is useful when the data can be readily represented as a Pandas DataFrame. In this approach, the DataFrame is serialized into a list of dictionaries, with each dictionary corresponding to a row in the DataFrame. 134 | # MAGIC 135 | # MAGIC ####Pros and Cons 136 | # MAGIC - __Pros__: This format is easier to read and is more human-friendly. 137 | # MAGIC - __Cons__: It consumes more bandwidth because the column names are repeated for each record. 138 | # MAGIC 139 | # MAGIC #### Use Case 140 | # MAGIC This format is preferable when you need to send DataFrame-like data, and readability is a priority. 141 | 142 | # COMMAND ---------- 143 | 144 | # Serialize using json 145 | import json 146 | serialized_data = json.dumps({"dataframe_records": input_examples_df_records.to_dict('records')}, indent=4) 147 | print(serialized_data) 148 | score_model(serialized_data) 149 | 150 | # COMMAND ---------- 151 | 152 | # MAGIC %md 153 | # MAGIC 154 | # MAGIC ### DataFrame Split Format 155 | # MAGIC 156 | # MAGIC #### Overview 157 | # MAGIC 158 | # MAGIC This format represents a Pandas DataFrame in a split orientation, separating the columns, index, and data into different keys. This is a more bandwidth-efficient alternative to the records orientation. 159 | # MAGIC 160 | # MAGIC #### Pros and Cons 161 | # MAGIC 162 | # MAGIC - __Pros__: This format is more bandwidth-efficient as compared to the records orientation. 163 | # MAGIC - __Cons__: It is a bit less intuitive to read. 164 | # MAGIC 165 | # MAGIC #### Use Case 166 | # MAGIC 167 | # MAGIC This format is useful when sending DataFrame-like data, and bandwidth or payload size is a concern. 
168 | 169 | # COMMAND ---------- 170 | 171 | serialized_data = json.dumps({"dataframe_split": input_examples_df_records.to_dict('split')}, indent=4) 172 | print(serialized_data) 173 | score_model(serialized_data) 174 | -------------------------------------------------------------------------------- /Chapter-09/data/datagen.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | # MAGIC %md 3 | # MAGIC 4 | # MAGIC ### Month 1 - Base line Data 5 | # MAGIC 6 | # MAGIC We will generate a dummy dataset for showcasing model drift. The dataset consists of time series data for 3 months. 7 | # MAGIC 8 | # MAGIC The independent features of the dataset include the following features: 9 | # MAGIC 10 | # MAGIC **Features** 11 | # MAGIC * `Temperature` (Numeric) : Highest daily temperature in Fahrenheit. 12 | # MAGIC * `Weather_Condition` (Categorical): 'sunny', 'cloudy', 'rainy' 13 | # MAGIC * `Promotion_Type` (Categorical): 'discount', 'free_gift', 'bundle_deal' 14 | # MAGIC * `Website_Traffic` (Numeric): Total website traffic 15 | # MAGIC * `Device_Type` (Categorical): 16 | # MAGIC 17 | # MAGIC **Target** 18 | # MAGIC * `Daily_Sales` (Numeric): 19 | # MAGIC 20 | # MAGIC The `Daily_Sales` target will have following correlation with various features" 21 | # MAGIC * `Positive correlation` with `Temperature` and `Website_Traffic`. 22 | # MAGIC * `Negative correlation` with `Weather_Condition` and `Device_Type`. 23 | # MAGIC 24 | # MAGIC We will train our model on the first month worth of data and then simulate various drift patterns in the consecutive months of data. 25 | # MAGIC 26 | 27 | # COMMAND ---------- 28 | 29 | import numpy as np 30 | import pandas as pd 31 | import matplotlib.pyplot as plt 32 | 33 | # Set random seed for reproducibility 34 | np.random.seed(0) 35 | 36 | # Generate dates for the time series data 37 | dates = pd.date_range('2023-01-01', '2023-01-31') 38 | num_days = len(dates) 39 | # Generate independent feature data 40 | temperature = np.round(np.random.normal(loc=25, scale=5, size=num_days), 2) 41 | weather_condition = np.random.choice(['sunny', 'cloudy', 'rainy'], size=num_days, p=[0.5, 0.3, 0.2]) 42 | promotion_type = np.random.choice(['discount', 'free_gift', 'bundle_deal'], size=num_days, p=[0.4, 0.3, 0.3]) 43 | website_traffic = np.random.normal(loc=500, scale=100, size=num_days).astype(int) # Generate website traffic as integers 44 | device_type = np.random.choice(['mobile', 'desktop', 'tablet'], size=num_days, p=[0.6, 0.3, 0.1]) 45 | 46 | # Generate dependent feature data (daily sales) 47 | # Add positive correlation with temperature and website_traffic 48 | # Add negative correlation with weather_condition and device_type 49 | sales = np.round(1000 + 10*temperature + 5*website_traffic - 50*(weather_condition == 'rainy') - 100*(device_type == 'desktop')).astype(int) 50 | 51 | # Create a pandas DataFrame to store the time series data 52 | sales_data_month1 = pd.DataFrame({'Date': dates, 53 | 'Temperature': temperature, 54 | 'Weather_Condition': weather_condition, 55 | 'Promotion_Type': promotion_type, 56 | 'Website_Traffic': website_traffic, 57 | 'Device_Type': device_type, 58 | 'Daily_Sales': sales}) 59 | 60 | 61 | # COMMAND ---------- 62 | 63 | # MAGIC %md 64 | # MAGIC 65 | # MAGIC ### Month 2 - New Data Arrives 66 | # MAGIC 67 | # MAGIC Our model has been deployed for a month and we now have an incoming fresh month of data. 
68 | # MAGIC 69 | # MAGIC **Scenario:** 70 | # MAGIC * An updated upstream Data cleaning process has a bug causing the the value of, `website_traffic` counts for promotion type `bundle_deal` and `free_gift` to be empty. 71 | # MAGIC 72 | # MAGIC * Also during the upstream data generation procedure a the temperature values are now being captured in __Fahrenheit__ rather than in __Celcius__. 73 | # MAGIC 74 | # MAGIC **What are we simulating here?** 75 | # MAGIC * Feature drift 76 | # MAGIC * Upstream data errors 77 | 78 | # COMMAND ---------- 79 | 80 | # Generate dates for the time series data 81 | dates = pd.date_range('2023-02-01', '2023-02-28') 82 | num_days = len(dates) 83 | 84 | # introducing feature drift 85 | # Generate independent feature data 86 | temperature_celcicus = np.round(np.random.normal(loc=25, scale=5, size=num_days), 2) 87 | 88 | weather_condition = np.random.choice(['sunny', 'cloudy', 'rainy'], size=num_days, p=[0.5, 0.3, 0.2]) 89 | promotion_type = np.random.choice(['discount', 'free_gift', 'bundle_deal'], size=num_days, p=[0.4, 0.3, 0.3]) 90 | website_traffic = np.random.normal(loc=500, scale=100, size=num_days).astype(int) # Generate website traffic as integers 91 | device_type = np.random.choice(['mobile', 'desktop', 'tablet'], size=num_days, p=[0.6, 0.3, 0.1]) 92 | 93 | # Generate dependent feature data (daily sales) 94 | # Add positive correlation with temperature and website_traffic 95 | # Add negative correlation with weather_condition and device_type 96 | sales = np.round(1000 + 10*temperature_celcicus + 5*website_traffic - 50*(weather_condition == 'rainy') - 100*(device_type == 'desktop')).astype(int) 97 | 98 | # Create a pandas DataFrame to store the time series data 99 | sales_data_month2_correct = pd.DataFrame({'Date': dates, 100 | 'Temperature': temperature_celcicus, 101 | 'Weather_Condition': weather_condition, 102 | 'Promotion_Type': promotion_type, 103 | 'Website_Traffic': website_traffic, 104 | 'Device_Type': device_type, 105 | 'Daily_Sales': sales}) 106 | 107 | 108 | #change temperature scale to Fehrenheit 109 | #Convert the Celsius temperatures to Fahrenheit 110 | temperature_fahrenheit = (temperature_celcicus * 9 / 5) + 32 111 | 112 | 113 | # Create a pandas DataFrame to store the time series data 114 | sales_data_month2_wrong = pd.DataFrame({'Date': dates, 115 | 'Temperature': temperature_fahrenheit, 116 | 'Weather_Condition': weather_condition, 117 | 'Promotion_Type': promotion_type, 118 | 'Website_Traffic': website_traffic, 119 | 'Device_Type': device_type, 120 | 'Daily_Sales': sales}) 121 | 122 | #introducing upstream processing error causing website traffic to be empty for bundle_deal and free_gift 123 | sales_data_month2_wrong.loc[sales_data_month2_wrong['Promotion_Type'] == 'bundle_deal', 'Website_Traffic'] = None 124 | sales_data_month2_wrong.loc[ sales_data_month2_wrong['Promotion_Type'] == 'free_gift', 'Website_Traffic'] = None 125 | 126 | sales_data_month2_wrong.to_csv(f'/dbfs{raw_month2_bad_data_path}/data.csv', index=False) 127 | 128 | # COMMAND ---------- 129 | 130 | #sales_data_month2_correct 131 | 132 | # COMMAND ---------- 133 | 134 | # MAGIC %md 135 | # MAGIC ### Month 3 136 | # MAGIC 137 | # MAGIC **Scenario:** 138 | # MAGIC * A product campaign went viral on social media. Sales increased by 30% for each day. 
139 | # MAGIC 140 | # MAGIC **What are we simulating here?** 141 | # MAGIC * Concept Drift 142 | 143 | # COMMAND ---------- 144 | 145 | dates = pd.date_range('2023-03-01', '2023-03-31') 146 | num_days = len(dates) 147 | 148 | # Generate independent feature data 149 | temperature = np.round(np.random.normal(loc=25, scale=5, size=num_days), 2) 150 | weather_condition = np.random.choice(['sunny', 'cloudy', 'rainy'], size=num_days, p=[0.5, 0.3, 0.2]) 151 | promotion_type = np.random.choice(['discount', 'free_gift', 'bundle_deal'], size=num_days, p=[0.4, 0.3, 0.3]) 152 | website_traffic = np.random.normal(loc=500, scale=100, size=num_days).astype(int) # Generate website traffic as integers 153 | device_type = np.random.choice(['mobile', 'desktop', 'tablet'], size=num_days, p=[0.6, 0.3, 0.1]) 154 | 155 | #increase daily sales by 30% 156 | sales = np.round((1000 - 10*temperature + 5*website_traffic - 50*(weather_condition == 'rainy') - 100*(device_type == 'desktop')) * 1.3).astype(int) 157 | 158 | # Create a pandas DataFrame to store the time series data 159 | sales_data_month3 = pd.DataFrame({'Date': dates, 160 | 'Temperature': temperature, 161 | 'Weather_Condition': weather_condition, 162 | 'Promotion_Type': promotion_type, 163 | 'Website_Traffic': website_traffic, 164 | 'Device_Type': device_type, 165 | 'Daily_Sales': sales}) 166 | 167 | 168 | #sales_data_month3 169 | 170 | # COMMAND ---------- 171 | 172 | merged_raw_df = pd.concat([sales_data_month1, sales_data_month2_correct, sales_data_month3]) 173 | # Write the dataframe to a CSV file and give path to dbfs directory we created for storing the raw file. 174 | merged_raw_df.to_csv(f'/dbfs{raw_good_data_path}/data.csv', index=False) 175 | -------------------------------------------------------------------------------- /Chapter-04/mlflow-with-featurestore.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | # MAGIC %md # MLflow introduction. 3 | # MAGIC 4 | # MAGIC This tutorial covers an example of how to use the integrated MLflow tracking capabilities to track your model training with the integrated feature store. 5 | # MAGIC - Import data that was previously registered in the feature store table. 6 | # MAGIC - Create a baseline model for churn prediction and store it in the integrated MLflow tracking server. 7 | 8 | # COMMAND ---------- 9 | 10 | # MAGIC %md 11 | # MAGIC ###0. SETUP -- Databricks Spark cluster: 12 | # MAGIC 13 | # MAGIC 1. **Create** a cluster by... 14 | # MAGIC - Click the `Compute` icon on the left sidebar and then `Create Cluster.` 15 | # MAGIC - In `Policy` select `Unrestricted`. 16 | # MAGIC - Enter any text, i.e `demo` into the cluster name text box. 17 | # MAGIC - Select `Single Node` in the cluster mode. 18 | # MAGIC - Select the `Databricks runtime version` value `13.3 LTS (Scala 2.12, Spark 3.4.1)` from the `ML` tab. 19 | # MAGIC - On `AWS`, select `i3.xlarge` / on `Azure`, select `Standard_DS4_V2` as __Node type__. 20 | # MAGIC - Click the `create cluster` button and wait for your cluster to be provisioned 21 | # MAGIC 3. **Attach** this notebook to your cluster by... 
22 | # MAGIC - Click on your cluster name in menu `Detached` at the top left of this workbook to attach it to this workbook 23 | 24 | # COMMAND ---------- 25 | 26 | #install latest version of sklearn 27 | %pip install -U scikit-learn 28 | 29 | # COMMAND ---------- 30 | 31 | # MAGIC %md 32 | # MAGIC ### Step 1) Importing the desired libraries and defining few constants and creating training set from the registered feature table. 33 | 34 | # COMMAND ---------- 35 | 36 | from databricks.feature_store import FeatureStoreClient 37 | from databricks.feature_store import FeatureLookup 38 | import typing 39 | 40 | from sklearn import metrics 41 | from sklearn.ensemble import RandomForestClassifier 42 | from sklearn.model_selection import train_test_split 43 | import mlflow 44 | import pandas as pd 45 | 46 | # COMMAND ---------- 47 | 48 | #Name of the model 49 | MODEL_NAME = "random_forest_classifier_featurestore" 50 | #This is the name for the entry in model registry 51 | MODEL_REGISTRY_NAME = "Bank_Customer_Churn" 52 | #The email you use to authenticate in the Databricks workspace 53 | USER_EMAIL = "debu.sinha@databricks.com" 54 | #Location where the MLflow experiement will be listed in user workspace 55 | EXPERIMENT_NAME = f"/Users/{USER_EMAIL}/Bank_Customer_Churn_Analysis" 56 | # we have all the features backed into a Delta table so we will read directly 57 | FEATURE_TABLE = "bank_churn_analysis.bank_customer_features" 58 | 59 | 60 | # COMMAND ---------- 61 | 62 | 63 | # this code is just for demonstration and you can utilize this as starting point and build more errorhandling around it. 64 | class Feature_Lookup_Input_Tuple(typing.NamedTuple): 65 | fature_table_name: str 66 | feature_list: typing.Union[typing.List[str], None] 67 | lookup_key: typing.List[str] 68 | 69 | # this code is going to generate feature look up based on on the list of feature mappings provided. 70 | def generate_feature_lookup(feature_mapping: typing.List[Feature_Lookup_Input_Tuple]) -> typing.List[FeatureLookup]: 71 | lookups = [] 72 | for fature_table_name, feature_list, lookup_key in feature_mapping: 73 | lookups.append( 74 | FeatureLookup( 75 | table_name = fature_table_name, 76 | feature_names = feature_list, 77 | lookup_key = lookup_key 78 | ) 79 | ) 80 | return lookups 81 | 82 | 83 | # COMMAND ---------- 84 | 85 | # MAGIC %md 86 | # MAGIC ### Step 2) Build a simplistic model that uses the feature store table as its source for training and validation. 87 | 88 | # COMMAND ---------- 89 | 90 | #initialize the feature store client 91 | fs = FeatureStoreClient() 92 | mlflow.set_experiment(EXPERIMENT_NAME) 93 | 94 | with mlflow.start_run(): 95 | TEST_SIZE = 0.20 96 | 97 | #define the list of features we want to get from feature table 98 | #If we have to combine data from multiple feature tables then we can provide multiple mappings for feature tables 99 | features = [Feature_Lookup_Input_Tuple(FEATURE_TABLE,["CreditScore" , "Age", "Tenure",\ 100 | "Balance", "NumOfProducts", "HasCrCard",\ 101 | "IsActiveMember", "EstimatedSalary", "Geography_Germany",\ 102 | "Geography_Spain", "Gender_Male"], ["CustomerId"] )] 103 | 104 | lookups = generate_feature_lookup(features) 105 | 106 | #Now we will simulate receiving only ID's of customers and the label as input at the time of inference 107 | training_df = spark.table(FEATURE_TABLE).select("CustomerId", "Exited") 108 | 109 | #Using the training set we will combine the training dataframe with the features stored in the feature tables. 
110 | training_data = fs.create_training_set( 111 | df=training_df, 112 | feature_lookups=lookups, 113 | label="Exited", 114 | exclude_columns=['CustomerId'] 115 | ) 116 | 117 | #convert the dataset to pandas so that we can fit sklearn RandomForestClassifier on it 118 | train_df = training_data.load_df().toPandas() 119 | 120 | #The train_df represents the input dataframe that has all the feature columns along with the new raw input in the form of training_df. 121 | X = train_df.drop(['Exited'], axis=1) 122 | y = train_df['Exited'] 123 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=TEST_SIZE, random_state=54, stratify=y) 124 | 125 | #here we will are not doing any hyperparameter tuning however, in future we will see how to perform hyperparameter tuning in scalable manner on Databricks. 126 | model = RandomForestClassifier(n_estimators=100).fit(X_train, y_train) 127 | signature = mlflow.models.signature.infer_signature(X_train, model.predict(X_train)) 128 | 129 | predictions = model.predict(X_test) 130 | fpr, tpr, _ = metrics.roc_curve(y_test, predictions, pos_label=1) 131 | auc = metrics.auc(fpr, tpr) 132 | accuracy = metrics.accuracy_score(y_test, predictions) 133 | 134 | #get the calculated feature importances. 135 | importances = dict(zip(model.feature_names_in_, model.feature_importances_)) 136 | #log artifact 137 | mlflow.log_dict(importances, "feature_importances.json") 138 | #log metrics 139 | mlflow.log_metric("auc", auc) 140 | mlflow.log_metric("accuracy", accuracy) 141 | #log parameters 142 | mlflow.log_param("split_size", TEST_SIZE) 143 | mlflow.log_params(model.get_params()) 144 | #set tag 145 | mlflow.set_tag(MODEL_NAME, "mlflow and feature store demo") 146 | #log the model itself in mlflow tracking server 147 | mlflow.sklearn.log_model(model, MODEL_NAME, signature=signature, input_example=X_train.iloc[:4, :]) 148 | 149 | # finally to make the feature store track what features are being used by our model we call log_model with the feature store client 150 | fs.log_model( 151 | model, 152 | MODEL_NAME, 153 | flavor=mlflow.sklearn, 154 | training_set=training_data, 155 | registered_model_name=MODEL_REGISTRY_NAME 156 | ) 157 | 158 | 159 | 160 | # COMMAND ---------- 161 | 162 | # MAGIC %md 163 | # MAGIC ### Step 3) Now that we have the model logged to the MLflow tracking server, we can get the latest version from the experiment and use it. 164 | 165 | # COMMAND ---------- 166 | 167 | from mlflow.tracking import MlflowClient 168 | #initialize the mlflow client 169 | client = MlflowClient() 170 | #get the experiment id 171 | experiment_id = mlflow.get_experiment_by_name(EXPERIMENT_NAME).experiment_id 172 | #get the latest run id which will allow us to directly access the metrics, and attributes and all th einfo 173 | run_id = mlflow.search_runs(experiment_id, order_by=["start_time DESC"]).head(1)["run_id"].values[0] 174 | 175 | # COMMAND ---------- 176 | 177 | # MAGIC %md 178 | # MAGIC - With the feature store registration associated with the MLflow model, we don't have to specify any data loading and processing to happen other than a point to the raw data that features will be calculated from. 179 | # MAGIC - We can do batch predictions simply by accessing the feature store instance, providing the run_id and the model's name (MODEL_NAME below) with the raw data specified as the second argument. 
180 | # MAGIC - If we want to provide new values for certain feature that is already part of the feature table, just include it in the new dataframe that we want to perform the prediction on. 181 | 182 | # COMMAND ---------- 183 | 184 | #at the time of infernce you can provide just the CustomerId. This is the key that will perform all the lookup for the features automatically. 185 | predictions = fs.score_batch(f"runs:/{run_id}/{MODEL_NAME}", spark.table(FEATURE_TABLE).select("CustomerId")) 186 | 187 | # COMMAND ---------- 188 | 189 | display(predictions) 190 | 191 | # COMMAND ---------- 192 | 193 | # MAGIC %md 194 | # MAGIC ##Cleanup 195 | 196 | # COMMAND ---------- 197 | 198 | #Uncomment to lines below and execute for cleaning up. 199 | ''' 200 | from mlflow.tracking import MlflowClient 201 | 202 | #get all the information about the current experiment 203 | experiment_id = mlflow.get_experiment_by_name(EXPERIMENT_NAME).experiment_id 204 | 205 | #list all the runs that are part of this experiment and delete them 206 | runs = mlflow.list_run_infos(experiment_id=experiment_id) 207 | for run in runs: 208 | mlflow.delete_run(run_id = run.run_id) 209 | 210 | #finally delete the experiment 211 | mlflow.delete_experiment(experiment_id=experiment_id) 212 | 213 | client = MlflowClient() 214 | #delete the model registered in the registry to clear the linkage in thefeature store 215 | client.delete_registered_model(name=MODEL_REGISTRY_NAME) 216 | ''' 217 | -------------------------------------------------------------------------------- /Chapter-07/batch-and-streaming.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | # MAGIC %md 3 | # MAGIC %md 4 | # MAGIC ## Author 5 | # MAGIC 6 | # MAGIC - **Debu Sinha** 7 | # MAGIC 8 | # MAGIC ## Tested Environment 9 | # MAGIC 10 | # MAGIC - **Databricks Runtime**: This notebook is tested on Databricks Runtime for Machine Learning 13.3 LTS or above. 11 | # MAGIC - **Cluster Configuration**: Single node cluster with at least 32GB RAM and 4 VCPU. 12 | # MAGIC - **Note**: The same cluster set up in Chapters 3 and 4 will be used here. 13 | # MAGIC 14 | # MAGIC ## Cluster Setup Instructions 15 | # MAGIC 16 | # MAGIC 1. **Create a Cluster**: 17 | # MAGIC - Navigate to the `Compute` icon on the left sidebar and click on `Create Cluster`. 18 | # MAGIC - Under `Policy`, select `Unrestricted`. 19 | # MAGIC - Enter a name for your cluster, for example, `demo`, into the cluster name text box. 20 | # MAGIC - In `Cluster Mode`, select `Single Node`. 21 | # MAGIC - Choose `Databricks Runtime Version` 13.3 LTS (Scala 2.12, Spark 3.4.1) from the `ML` tab. 22 | # MAGIC - On `AWS`, select `i3.xlarge` / on `Azure`, select `Standard_DS4_V2` as __Node type__. 23 | # MAGIC - Click on `Create Cluster` and wait for your cluster to be provisioned. 24 | # MAGIC 25 | # MAGIC 2. **Attach this Notebook to Your Cluster**: 26 | # MAGIC - Click on the menu labeled `Detached` at the top left of this workbook. 27 | # MAGIC - Select your cluster name to attach this notebook to your cluster. 28 | # MAGIC 29 | # MAGIC ## Batch Deployment 30 | # MAGIC 31 | # MAGIC This notebook will go over the most common model deployment option of batch inferencing. We will load the latest model version for our Bank customer churn prediction problem from the model registry and load it as a python function that can be applied to a Spark Dataframe. 
32 | 33 | # COMMAND ---------- 34 | 35 | # MAGIC %md ### Inference in Spark 36 | # MAGIC 37 | # MAGIC So far we have seen how you can use different machine learning libraries to train your model. When it comes to deployment, we can now utilize the power of Spark to distribute our trained model across more than a single node and make predictions at scale. 38 | # MAGIC 39 | # MAGIC To do this, we will use `mlflow.pyfunc.spark_udf` and pass in the `SparkSession`, the name of the model, and the run ID. 40 | # MAGIC 41 | # MAGIC Note: Using UDFs in Spark means that supporting libraries must be installed on every node in the cluster. In the case of `sklearn`, this is installed in Databricks clusters by default. When using other libraries, you will need to install them to ensure that they will work as UDFs. 42 | 43 | # COMMAND ---------- 44 | 45 | # MAGIC %md 46 | # MAGIC First we will load the desired model from the model registry. 47 | 48 | # COMMAND ---------- 49 | 50 | import mlflow 51 | 52 | # the name of the model in the registry 53 | registry_model_name = "Churn Prediction Bank" 54 | 55 | # get the latest version of the model in staging and load it as a spark_udf. 56 | # MLflow easily produces a Spark user defined function (UDF). This bridges the gap between Python environments and applying models at scale using Spark. 57 | model = mlflow.pyfunc.spark_udf(spark, model_uri=f"models:/{registry_model_name}/staging") 58 | 59 | # COMMAND ---------- 60 | 61 | # MAGIC %md 62 | # MAGIC This model was trained on the raw dataset using Databricks AutoML. 63 | # MAGIC 64 | # MAGIC Note: Make sure the dataset we want to run inference on matches the schema of the dataset the model was trained on. In the current example we will simply reuse the dataset we used to train our model. 65 | # MAGIC - As a best practice, model-specific transformations such as imputing missing values or scaling a column value should be done as part of the model pipeline and not when registering a table as a feature table. 66 | 67 | # COMMAND ---------- 68 | 69 | spark_df = spark.table("bank_churn_analysis.raw_Data") 70 | display(spark_df) 71 | 72 | # COMMAND ---------- 73 | 74 | # MAGIC %md 75 | # MAGIC Note: we will not send the RowNumber, CustomerId, Surname and Exited columns to the model. 76 | 77 | # COMMAND ---------- 78 | 79 | exclude_columns = {'RowNumber', "CustomerId", "Surname", "Exited"} 80 | input_columns = [col for col in spark_df.columns if col not in exclude_columns] 81 | input_columns 82 | 83 | # COMMAND ---------- 84 | 85 | # MAGIC %md Apply the model as a standard UDF using the column names as the input to the function. 86 | 87 | # COMMAND ---------- 88 | 89 | #passing non-label columns to the model as input 90 | prediction_df = spark_df.withColumn("prediction", model(*input_columns)) 91 | 92 | display(prediction_df) 93 | 94 | # COMMAND ---------- 95 | 96 | # MAGIC %md 97 | # MAGIC Now you can write the inference output to a database for fast access, to a Delta table, or to any other file format depending on your application's needs. 98 | 99 | # COMMAND ---------- 100 | 101 | # MAGIC %md 102 | # MAGIC __Note:__ In the above example we showcased how you can use the MLflow API to perform batch inference. We didn't make use of the model trained on the feature table that we created in Chapter 2. If you want to utilize the Feature Store API to log a trained model and also perform batch inference, check the notebook in Chapter 4, which has the details. 
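# COMMAND ----------

# MAGIC %md
# MAGIC Before moving on to streaming, the cell below is a minimal sketch of persisting the batch predictions produced above to a Delta table, as mentioned earlier in this notebook. The target table name `bank_churn_analysis.churn_batch_predictions` is a placeholder chosen for illustration; point it at whatever database and table fit your environment.

# COMMAND ----------

# Illustrative sketch: persist the scored DataFrame as a Delta table so downstream consumers can query the predictions.
# "overwrite" keeps this example idempotent; for incremental scoring runs you would typically
# use mode("append") or a Delta MERGE keyed on CustomerId instead.
(prediction_df
 .write
 .format("delta")
 .mode("overwrite")
 .saveAsTable("bank_churn_analysis.churn_batch_predictions"))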
103 | 104 | # COMMAND ---------- 105 | 106 | # MAGIC %md # Streaming Deployment 107 | 108 | # COMMAND ---------- 109 | 110 | # MAGIC %md 111 | # MAGIC We can also perform continuous model inference using a technology like Spark's Structured Streaming. You can read more about it [here](https://spark.apache.org/docs/latest/structured-streaming-programming-guide.html). The advantages of using Spark to build your streaming ingestion pipelines and model inference solution are that: 112 | # MAGIC - It offers the same DataFrame API for processing streaming data as you would use with batch data. 113 | # MAGIC - It provides a scalable and fault-tolerant way to continuously perform inference on incoming new data. 114 | # MAGIC 115 | # MAGIC We will not go into the details of Spark Structured Streaming here, but we will cover how you can deploy a model for inference on a stream of data. 116 | # MAGIC 117 | # MAGIC The first step is usually to connect to a streaming data source like Kafka, Azure Event Hubs, or Kinesis. Using Spark Structured Streaming you can also simulate reading files as a stream from cloud storage like S3. For our example we are going to do just that. 118 | # MAGIC 119 | # MAGIC We'll read a Delta table as a stream. 120 | 121 | # COMMAND ---------- 122 | 123 | # right now we are just defining a streaming data source; this statement will not execute until we call a Spark action. 124 | raw_streaming_df = spark.readStream.format("delta").option("ignoreChanges", "true").table("bank_churn_analysis.raw_Data").drop(*("RowNumber", "CustomerId", "Surname", "Exited")) 125 | 126 | # if you want to read from an S3 location then use the next set of code 127 | # streaming_data = (spark 128 | # .readStream 129 | # .schema(schema) 130 | # .option("maxFilesPerTrigger", 1) 131 | # .parquet("") 132 | # .drop(*("RowNumber", "CustomerId", "Surname", "Exited"))) 133 | 134 | # COMMAND ---------- 135 | 136 | # we will use this to keep track of our streaming job 137 | stream_name = "streaming_inference" 138 | 139 | # COMMAND ---------- 140 | 141 | predictions_df = raw_streaming_df.withColumn("prediction", model(*raw_streaming_df.columns)) 142 | display(predictions_df, streamName=stream_name) 143 | 144 | # COMMAND ---------- 145 | 146 | # A Spark structured stream takes some time to finish initializing, and trying to shut it off before it is active will throw an error. This code waits (up to 20 seconds) for the stream to become active. 147 | active_streams = [stream.name for stream in spark.streams.active] 148 | active_streams 149 | 150 | import time 151 | start_time = time.time() 152 | while stream_name not in [stream.name for stream in spark.streams.active]: 153 | time.sleep(5) 154 | # wait up to 20 seconds to let the stream initialize 155 | if time.time()-start_time>20: 156 | # stream initialization was not kicked off or there is some network issue. 157 | break 158 | 159 | # COMMAND ---------- 160 | 161 | # We will stop the stream after reviewing results 162 | for stream in spark.streams.active: 163 | print(f"Stopping {stream.name}") 164 | stream.stop() # Stop the stream 165 | 166 | # COMMAND ---------- 167 | 168 | # MAGIC %md 169 | # MAGIC 170 | # MAGIC ### Write to Delta table 171 | 172 | # COMMAND ---------- 173 | 174 | working_dir = "/tmp" 175 | # this is important for streaming queries to keep track of which records have been processed and guarantee each record is processed only once. 
176 | checkpoint_location = f"{working_dir}/stream.checkpoint" 177 | # this is a temporary location where we will write the predictions of our model as Delta table 178 | write_path = f"{working_dir}/predictions" 179 | 180 | (predictions_df 181 | .writeStream # Write the stream 182 | .queryName(stream_name) # Name the query 183 | .format("delta") # Use the delta format 184 | .option("checkpointLocation", checkpoint_location) # Specify where to log metadata 185 | .option("path", write_path) # Specify the output path 186 | .outputMode("append") # "append" means append the new data to the table 187 | .start() # Start the operation 188 | ) 189 | 190 | # COMMAND ---------- 191 | 192 | # MAGIC %md 193 | # MAGIC we can take a look at what files are written to the file system 194 | 195 | # COMMAND ---------- 196 | 197 | # MAGIC %fs 198 | # MAGIC ls /tmp/predictions/ 199 | 200 | # COMMAND ---------- 201 | 202 | # MAGIC %sql 203 | # MAGIC select * from delta.`/tmp/predictions` 204 | 205 | # COMMAND ---------- 206 | 207 | # We will stop the stream after writing the data to the delta table 208 | for stream in spark.streams.active: 209 | print(f"Stopping {stream.name}") 210 | stream.stop() # Stop the stream 211 | -------------------------------------------------------------------------------- /Chapter-03/churn-analysis.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | # MAGIC %md 3 | # MAGIC * [**Customer Churn**](https://en.wikipedia.org/wiki/Customer_attrition) also known as Customer attrition, customer turnover, or customer defection, is the loss of clients or customers and is... 4 | # MAGIC * Built on top of Databricks Platform 5 | # MAGIC * Uses Databricks ML runtime and Feature store 6 | # MAGIC * This Notebook... 7 | # MAGIC * We will use Customer Churn dataset from the [Kaggle](https://www.kaggle.com/mathchi/churn-for-bank-customers). 8 | # MAGIC * We will skip the EDA part and focus on the feature engineering part and registering feature tables into Databricks feature store. 9 | 10 | # COMMAND ---------- 11 | 12 | # MAGIC %md 13 | # MAGIC ###0. SETUP -- Databricks Spark cluster: 14 | # MAGIC 15 | # MAGIC 1. **Create** a cluster by... 16 | # MAGIC - Click the `Compute` icon on the left sidebar and then `Create Cluster.` 17 | # MAGIC - In `Policy` select `Unrestricted`. 18 | # MAGIC - Enter any text, i.e `demo` into the cluster name text box. 19 | # MAGIC - Select `Single Node` in the cluster mode. 20 | # MAGIC - Select the `Databricks runtime version` value `13.3 LTS (Scala 2.12, Spark 3.4.1)` from the `ML` tab. 21 | # MAGIC - On `AWS`, select `i3.xlarge` / on `Azure`, select `Standard_DS4_V2` as __Node type__. 22 | # MAGIC - Click the `create cluster` button and wait for your cluster to be provisioned 23 | # MAGIC 3. **Attach** this notebook to your cluster by... 24 | # MAGIC - Click on your cluster name in menu `Detached` at the top left of this workbook to attach it to this workbook 25 | 26 | # COMMAND ---------- 27 | 28 | # MAGIC %md 29 | # MAGIC ###Step1: Ingest Data to Notebook 30 | # MAGIC 31 | # MAGIC We will download the dataset hosted at [**Kaggle**](https://www.kaggle.com/mathchi/churn-for-bank-customers) 32 | # MAGIC 33 | # MAGIC ## Content 34 | # MAGIC * `RowNumber` —corresponds to the record (row) number and has no effect on the output. 35 | # MAGIC * `CustomerId` -contains random values and has no effect on customer leaving the bank. 
36 | # MAGIC * `Surname` —the surname of a customer has no impact on their decision to leave the bank. 37 | # MAGIC * `CreditScore` —can have an effect on customer churn, since a customer with a higher credit score is less likely to leave the bank. 38 | # MAGIC * `Geography` —a customer’s location can affect their decision to leave the bank. 39 | # MAGIC * `Gender` —it’s interesting to explore whether gender plays a role in a customer leaving the bank. 40 | # MAGIC * `Age` —this is certainly relevant, since older customers are less likely to leave their bank than younger ones. 41 | # MAGIC * `Tenure` —refers to the number of years that the customer has been a client of the bank. Normally, older clients are more loyal and less likely to leave a bank. 42 | # MAGIC * `Balance` —also a very good indicator of customer churn, as people with a higher balance in their accounts are less likely to leave the bank compared to those with lower balances. 43 | # MAGIC * `NumOfProducts` —refers to the number of products that a customer has purchased through the bank. 44 | # MAGIC * `HasCrCard` —denotes whether or not a customer has a credit card. This column is also relevant, since people with a credit card are less likely to leave the bank. 45 | # MAGIC * `IsActiveMember` —active customers are less likely to leave the bank. 46 | # MAGIC * `EstimatedSalary` —as with balance, people with lower salaries are more likely to leave the bank compared to those with higher salaries. 47 | # MAGIC * `Exited` —whether or not the customer left the bank. 48 | # MAGIC 49 | # MAGIC ## Acknowledgements 50 | # MAGIC 51 | # MAGIC As we know, it is much more expensive to sign up a new client than to keep an existing one. 52 | # MAGIC It is advantageous for banks to know what leads a client towards the decision to leave the company. 53 | # MAGIC Churn prevention allows companies to develop loyalty programs and retention campaigns to keep as many customers as possible. 54 | # MAGIC 55 | # MAGIC Data: https://www.kaggle.com/mathchi/churn-for-bank-customers 56 | 57 | # COMMAND ---------- 58 | 59 | # MAGIC %md 60 | # MAGIC ## Import Data 61 | # MAGIC 62 | # MAGIC Next, we'll import our data for this part. 63 | 64 | # COMMAND ---------- 65 | 66 | #read more about reading files from Databricks repos at https://docs.databricks.com/repos.html#access-files-in-a-repo-programmatically 67 | import os 68 | bank_df = spark.read.option("header", True).option("inferSchema", True).csv(f"file:{os.getcwd()}/data/churn.csv") 69 | display(bank_df) 70 | 71 | # COMMAND ---------- 72 | 73 | # MAGIC %md 74 | # MAGIC We can drop RowNumber in the feature engineering step as it does not add any valuable information. 75 | # MAGIC 76 | # MAGIC **Note:** 77 | # MAGIC Databricks introduced a built-in data profiler for Spark DataFrames. The built-in `display` function now gives an option to profile the data automatically. 78 | 79 | # COMMAND ---------- 80 | 81 | display(bank_df) 82 | 83 | # COMMAND ---------- 84 | 85 | # MAGIC %md Let's get the unique value count in the Surname column. 86 | 87 | # COMMAND ---------- 88 | 89 | bank_df.select('Surname').distinct().count() 90 | 91 | # COMMAND ---------- 92 | 93 | # MAGIC %md 94 | # MAGIC As we can see, the Surname column has a lot of unique values and does not add any useful information, so we will drop it in our feature engineering step. 
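# COMMAND ----------

# MAGIC %md
# MAGIC As an optional, illustrative check (a sketch, not part of the original workflow), we can compute the distinct-value count for every column in a single pass instead of inspecting one column at a time. Identifier-like columns such as RowNumber and CustomerId will show counts equal to the row count, and high-cardinality columns such as Surname will also stand out, which supports dropping them during feature engineering.

# COMMAND ----------

import pyspark.sql.functions as F

# Count distinct values per column in one aggregation over the raw DataFrame.
distinct_counts = bank_df.select([F.countDistinct(F.col(c)).alias(c) for c in bank_df.columns])
display(distinct_counts)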
95 | 96 | # COMMAND ---------- 97 | 98 | # MAGIC %md 99 | # MAGIC ## Create Feature Table 100 | # MAGIC 101 | # MAGIC Next, we can use the DataFrame **`bank_df`** to create a feature table using Feature Store. 102 | # MAGIC 103 | # MAGIC **In order to write our features out as a feature table we will perform the following steps:** 104 | # MAGIC 1. Create a Database that will store any feature table. In our case let that be `bank_churn_analysis` 105 | # MAGIC 1. Write the Python functions to compute the features. The output of each function should be an Apache Spark DataFrame with a unique primary key. The primary key can consist of one or more columns. 106 | # MAGIC 1. Create a feature table by instantiating a FeatureStoreClient and using create_table (Databricks Runtime 10.2 ML or above) or create_feature_table (Databricks Runtime 10.1 ML or below). 107 | # MAGIC 1. Populate the feature table using write_table. 108 | # MAGIC 109 | # MAGIC Note: 110 | # MAGIC - **If you want to prevent any data leakage you would want to consider not performing OHE or any feature treatment at the time of registering dataset as a feature table. ** 111 | 112 | # COMMAND ---------- 113 | 114 | # MAGIC %md 115 | # MAGIC ## 1. Defining a database to store feature tables. 116 | 117 | # COMMAND ---------- 118 | 119 | DATABASE_NAME = "bank_churn_analysis" 120 | #setup database that will hold our Feature tables in Delta format. 121 | spark.sql(f"CREATE DATABASE IF NOT EXISTS {DATABASE_NAME}") 122 | 123 | # COMMAND ---------- 124 | 125 | # MAGIC %md 126 | # MAGIC write the raw data out as a delta table 127 | 128 | # COMMAND ---------- 129 | 130 | bank_df.write.format("delta").mode("overwrite").saveAsTable(f"{DATABASE_NAME}.raw_data") 131 | 132 | # COMMAND ---------- 133 | 134 | # MAGIC %md 135 | # MAGIC ## 2. Defining a feature engineering function that will return a Spark dataframe with a unique primary key. 136 | # MAGIC In our case it is the `CustomerId`. 137 | 138 | # COMMAND ---------- 139 | 140 | # MAGIC %md 141 | # MAGIC 142 | # MAGIC The `bank_df` DataFrame is already pretty clean, but we do have some nominal features that we'll need to convert to numeric features for modeling. 143 | # MAGIC 144 | # MAGIC These features include: 145 | # MAGIC 146 | # MAGIC * **`Geography`** 147 | # MAGIC * **`Gender`** 148 | # MAGIC 149 | # MAGIC We will also be dropping few features which dont add additional value for our model: 150 | # MAGIC * **`RowNumber`** 151 | # MAGIC * **`Surname`** 152 | # MAGIC 153 | # MAGIC ### Create `compute_features` Function 154 | # MAGIC 155 | # MAGIC A lot of data scientists are familiar with Pandas DataFrames, so we'll use the [pyspark.pandas](https://spark.apache.org/docs/3.2.0/api/python/user_guide/pandas_on_spark/) library to one-hot encode these categorical features. 156 | # MAGIC 157 | # MAGIC **Note:** we are creating a function to perform these computations. We'll use it to refer to this set of instructions when creating our feature table. 
158 | 159 | # COMMAND ---------- 160 | 161 | 162 | import pyspark.pandas as ps 163 | import numpy as np 164 | 165 | def compute_features(spark_df): 166 | # https://spark.apache.org/docs/latest/api/python/migration_guide/koalas_to_pyspark.html?highlight=dataframe%20pandas_api 167 | # Convert to pyspark.pandas DataFrame 168 | ps_df = spark_df.pandas_api() 169 | 170 | # Drop RowNumber & Surname column 171 | ps_df = ps_df.drop(['RowNumber', 'Surname'], axis=1) 172 | 173 | # One-Hot Encoding for Geography and Gender 174 | ohe_ps_df = ps.get_dummies( 175 | ps_df, 176 | columns=["Geography", "Gender"], 177 | dtype="int", 178 | drop_first=True 179 | ) 180 | 181 | # Clean up column names 182 | ohe_ps_df.columns = ohe_ps_df.columns.str.replace(r' ', '', regex=True) 183 | ohe_ps_df.columns = ohe_ps_df.columns.str.replace(r'(', '-', regex=True) 184 | ohe_ps_df.columns = ohe_ps_df.columns.str.replace(r')', '', regex=True) 185 | 186 | ## Additional example feature engineering steps 187 | 188 | # # Create a binary feature indicating whether the balance is zero or not 189 | # ohe_ps_df['Is_Balance_Zero'] = (ohe_ps_df['Balance'] == 0).astype('int') 190 | 191 | # # Ratio of Tenure to Age 192 | # ohe_ps_df['Tenure_to_Age'] = ohe_ps_df['Tenure'] / ohe_ps_df['Age'] 193 | 194 | # # Interaction feature: Balance to EstimatedSalary ratio 195 | # ohe_ps_df['Balance_to_Salary'] = ohe_ps_df['Balance'] / ohe_ps_df['EstimatedSalary'] 196 | 197 | return ohe_ps_df 198 | 199 | 200 | # COMMAND ---------- 201 | 202 | # MAGIC %md 203 | # MAGIC ### Compute Features 204 | # MAGIC 205 | # MAGIC Next, we can use our featurization function `compute_features` to create create a DataFrame of our features. 206 | 207 | # COMMAND ---------- 208 | 209 | bank_features_df = compute_features(bank_df) 210 | display(bank_features_df) 211 | 212 | # COMMAND ---------- 213 | 214 | # MAGIC %md 215 | # MAGIC ##3. Create the Feature Table 216 | # MAGIC 217 | # MAGIC Next, we can use the `feature_table` operation to register the DataFrame as a Feature Store table. 218 | # MAGIC 219 | # MAGIC In order to do this, we'll want the following details: 220 | # MAGIC 221 | # MAGIC 1. The `name` of the database and table where we want to store the feature table 222 | # MAGIC 1. The `keys` for the table 223 | # MAGIC 1. The `schema` of the table 224 | # MAGIC 1. A `description` of the contents of the feature table 225 | # MAGIC 1. `partition_columns`- Column(s) used to partition the feature table. 226 | # MAGIC 1. `features_df`(optional) - Data to insert into this feature table. The schema of features_df will be used as the feature table schema. 227 | # MAGIC 228 | # MAGIC **Note:** 229 | # MAGIC 1. This creates our feature table, but we still need to write our values in the DataFrame to the table. 230 | 231 | # COMMAND ---------- 232 | 233 | #Our first step is to instantiate the feature store client using `FeatureStoreClient()`. 234 | from databricks.feature_store import FeatureStoreClient 235 | fs = FeatureStoreClient() 236 | 237 | # COMMAND ---------- 238 | 239 | # MAGIC %md 240 | # MAGIC We have __2__ options to initialize a feature table. 241 | # MAGIC 242 | # MAGIC 1. Providing Dataframe to populate feature table at time of defining feature table. This approach can be used when you have a feature dataframe ready to instantiate a feature table. 
243 | # MAGIC ``` 244 | # MAGIC bank_feature_table = fs.create_table( 245 | # MAGIC name=f"{DATABASE_NAME}.bank_customer_features", # the name of the feature table 246 | # MAGIC primary_keys=["CustomerId"], # primary key that will be used to perform joins 247 | # MAGIC schema=bank_features_df.spark.schema(), # the schema of the Feature table 248 | # MAGIC description="This customer level table contains one-hot encoded categorical and scaled numeric features to predict bank customer churn.", 249 | # MAGIC feature_df=bank_features_df.to_spark() 250 | # MAGIC ) 251 | # MAGIC ``` 252 | # MAGIC 2. In second case you can provide definition of the feature table without providing a source dataframe. This approach can be used when your data to populate feature store will be ingested at a different time then when you are defining the feature table. We will be showcasing this approach as part of the notebook. 253 | 254 | # COMMAND ---------- 255 | 256 | bank_feature_table = fs.create_table( 257 | name=f"{DATABASE_NAME}.bank_customer_features", # the name of the feature table 258 | primary_keys=["CustomerId"], # primary key that will be used to perform joins 259 | schema=bank_features_df.spark.schema(), # the schema of the Feature table 260 | description="This customer level table contains one-hot encoded categorical and scaled numeric features to predict bank customer churn." 261 | ) 262 | 263 | # COMMAND ---------- 264 | 265 | # MAGIC %md 266 | # MAGIC ## 4. Populate the feature table using write_table. 267 | # MAGIC Now, we can write the records from **`bank_features_df`** to the feature table. 268 | 269 | # COMMAND ---------- 270 | 271 | fs.write_table(df=bank_features_df.to_spark(), name=f"{DATABASE_NAME}.bank_customer_features", mode="overwrite") 272 | #instead of overwrite you can choose "merge" as an option if you want to update only certain records. 273 | 274 | # COMMAND ---------- 275 | 276 | # MAGIC %md 277 | # MAGIC ##5. Browsing the Feature Store 278 | # MAGIC 279 | # MAGIC The tables are now visible and searchable in the [Feature Store](/#feature-store/feature-store) 280 | 281 | # COMMAND ---------- 282 | 283 | # MAGIC %md 284 | # MAGIC Optionally if your usecase requires joining features for real time inference, you can write your features out to an [online store](https://docs.databricks.com/applications/machine-learning/feature-store.html#publish-features-to-an-online-feature-store). 285 | # MAGIC 286 | # MAGIC And finally, we can perform Access Control using built-in features in the Feature Store UI. 287 | 288 | # COMMAND ---------- 289 | 290 | # MAGIC %md 291 | # MAGIC ### Cleanup 292 | 293 | # COMMAND ---------- 294 | 295 | #Drop feature table. This will drop the underlying Delta table as well. 296 | 297 | # fs.drop_table( 298 | # name=f"{DATABASE_NAME}.bank_customer_features" 299 | # ) 300 | 301 | # COMMAND ---------- 302 | 303 | # MAGIC %md 304 | # MAGIC Note: In you decide to drop table from UI follow the follwing steps.. 305 | # MAGIC 306 | # MAGIC Follow the following steps: 307 | # MAGIC - Go to [Feature Store](/#feature-store/feature-store) 308 | # MAGIC - Select the feature tables and select `delete` after clicking on 3 vertical dots icon. 309 | # MAGIC 310 | # MAGIC Deleting the feature tables in this way requires you to manually delete the published online tables and the underlying Delta table separately. 
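# COMMAND ----------

# MAGIC %md
# MAGIC The commented-out cell below is a minimal sketch of the online-store publishing step mentioned above. It assumes an AWS workspace where the Feature Store already has credentials to write to DynamoDB; the spec class, region, and authentication arguments are placeholders and vary by cloud provider and feature-store client version, so treat this as illustrative only and consult the Feature Store documentation for your setup.

# COMMAND ----------

# Uncomment and adapt to publish the offline feature table to an online store for low-latency lookups at inference time.

# from databricks.feature_store.online_store_spec import AmazonDynamoDBSpec
#
# online_store_spec = AmazonDynamoDBSpec(region="us-west-2")  # region is a placeholder
#
# fs.publish_table(
#     name=f"{DATABASE_NAME}.bank_customer_features",
#     online_store=online_store_spec,
#     mode="merge"  # "merge" updates existing keys instead of overwriting the whole online table
# )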
311 | 312 | # COMMAND ---------- 313 | 314 | 315 | -------------------------------------------------------------------------------- /Chapter-07/real-time-additional.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | # MAGIC %md 3 | # MAGIC ## Author 4 | # MAGIC 5 | # MAGIC - **Debu Sinha** 6 | # MAGIC 7 | # MAGIC ## Tested Environment 8 | # MAGIC 9 | # MAGIC - **Databricks Runtime**: This notebook is tested on Databricks Runtime for Machine Learning 13.3 LTS or above. 10 | # MAGIC - **Cluster Configuration**: Single node cluster with at least 32GB RAM and 4 VCPU. 11 | # MAGIC - **Note**: The same cluster set up in Chapters 3 and 4 will be used here. 12 | # MAGIC 13 | # MAGIC ## Cluster Setup Instructions 14 | # MAGIC 15 | # MAGIC 1. **Create a Cluster**: 16 | # MAGIC - Navigate to the `Compute` icon on the left sidebar and click on `Create Cluster`. 17 | # MAGIC - Under `Policy`, select `Unrestricted`. 18 | # MAGIC - Enter a name for your cluster, for example, `demo`, into the cluster name text box. 19 | # MAGIC - In `Cluster Mode`, select `Single Node`. 20 | # MAGIC - Choose `Databricks Runtime Version` 13.3 LTS (Scala 2.12, Spark 3.4.1) from the `ML` tab. 21 | # MAGIC - On `AWS`, select `i3.xlarge` / on `Azure`, select `Standard_DS4_V2` as __Node type__. 22 | # MAGIC - Click on `Create Cluster` and wait for your cluster to be provisioned. 23 | # MAGIC 24 | # MAGIC 2. **Attach this Notebook to Your Cluster**: 25 | # MAGIC - Click on the menu labeled `Detached` at the top left of this workbook. 26 | # MAGIC - Select your cluster name to attach this notebook to your cluster. 27 | 28 | # COMMAND ---------- 29 | 30 | # MAGIC %pip install assertpy 31 | 32 | # COMMAND ---------- 33 | 34 | # the name of model in model registry you want to serve with serving endpoint. 35 | model_name = "Churn Prediction Bank" 36 | 37 | # serving endpoint name 38 | model_serving_endpoint_name = "churn_prediction_api_deployment" 39 | 40 | # COMMAND ---------- 41 | 42 | # MAGIC %md 43 | # MAGIC 44 | # MAGIC 45 | # MAGIC --- 46 | # MAGIC 47 | # MAGIC # Code Documentation for Token and Header Setup in Databricks 48 | # MAGIC 49 | # MAGIC This document provides an in-depth overview of the code that fetches the API token from a Databricks notebook, sets up the authorization header for REST API calls, and retrieves the Databricks instance URL. 50 | # MAGIC 51 | # MAGIC --- 52 | # MAGIC 53 | # MAGIC ## Code Sections 54 | # MAGIC 55 | # MAGIC ### 1. Fetch API Token from Databricks Notebook 56 | # MAGIC 57 | # MAGIC #### Purpose 58 | # MAGIC 59 | # MAGIC - Fetches the Databricks API token from the current notebook's context. 60 | # MAGIC 61 | # MAGIC #### Code Explanation 62 | # MAGIC 63 | # MAGIC - `token = dbutils.notebook.entry_point.getDbutils().notebook().getContext().apiToken().getOrElse(None)` 64 | # MAGIC 65 | # MAGIC #### Libraries Used 66 | # MAGIC 67 | # MAGIC - `dbutils`: Databricks utility to interact with Databricks services. 68 | # MAGIC 69 | # MAGIC --- 70 | # MAGIC 71 | # MAGIC ### 2. Create Authorization Headers 72 | # MAGIC 73 | # MAGIC #### Purpose 74 | # MAGIC 75 | # MAGIC - Sets up the headers required for authorization and content-type in REST API calls. 76 | # MAGIC 77 | # MAGIC #### Code Explanation 78 | # MAGIC 79 | # MAGIC - `headers = { "Authorization": f"Bearer {token}", "Content-Type": "application/json" }` 80 | # MAGIC 81 | # MAGIC #### Libraries Used 82 | # MAGIC 83 | # MAGIC - None. 
84 | # MAGIC 85 | # MAGIC --- 86 | # MAGIC 87 | # MAGIC ### 3. Fetch Databricks Instance URL 88 | # MAGIC 89 | # MAGIC #### Purpose 90 | # MAGIC 91 | # MAGIC - Retrieves the Databricks instance URL for further API calls. 92 | # MAGIC 93 | # MAGIC #### Code Explanation 94 | # MAGIC 95 | # MAGIC 1. `java_tags = dbutils.notebook.entry_point.getDbutils().notebook().getContext().tags()`: Fetches the notebook's tags as a Java object. 96 | # MAGIC 2. `tags = sc._jvm.scala.collection.JavaConversions.mapAsJavaMap(java_tags)`: Converts the Java tags object to a Python dictionary. 97 | # MAGIC 3. `instance = tags["browserHostName"]`: Extracts the Databricks instance (domain name) from the tags dictionary. 98 | # MAGIC 99 | # MAGIC #### Libraries Used 100 | # MAGIC 101 | # MAGIC - `dbutils`: Databricks utility. 102 | # MAGIC - `sc._jvm.scala.collection.JavaConversions`: Scala library for Java to Python type conversion. 103 | # MAGIC 104 | # MAGIC --- 105 | # MAGIC 106 | # MAGIC 107 | 108 | # COMMAND ---------- 109 | 110 | # get token from notebook 111 | token = dbutils.notebook.entry_point.getDbutils().notebook().getContext().apiToken().getOrElse(None) 112 | 113 | #create authorization header for REST calls 114 | headers = { 115 | "Authorization": f"Bearer {token}", 116 | "Content-Type": "application/json" 117 | } 118 | 119 | 120 | # Next we need an enpoint at which to execute our request which we can get from the Notebook's tags collection 121 | java_tags = dbutils.notebook.entry_point.getDbutils().notebook().getContext().tags() 122 | 123 | # This object comes from the Java CM 124 | tags = sc._jvm.scala.collection.JavaConversions.mapAsJavaMap(java_tags) 125 | 126 | # extract the databricks instance (domain name) from the dictionary 127 | instance = tags["browserHostName"] 128 | 129 | # COMMAND ---------- 130 | 131 | # MAGIC %md 132 | # MAGIC 133 | # MAGIC 134 | # MAGIC --- 135 | # MAGIC 136 | # MAGIC # Code Documentation for `get_latest_model_version` Function 137 | # MAGIC 138 | # MAGIC This document offers a comprehensive overview of the `get_latest_model_version` function which retrieves the latest version number of a specified model from MLflow's model registry. 139 | # MAGIC 140 | # MAGIC --- 141 | # MAGIC 142 | # MAGIC ## Function Overview 143 | # MAGIC 144 | # MAGIC ### `get_latest_model_version` 145 | # MAGIC 146 | # MAGIC Retrieves the latest version of a given model from the MLflow model registry. 147 | # MAGIC 148 | # MAGIC --- 149 | # MAGIC 150 | # MAGIC ## Detailed Function Description 151 | # MAGIC 152 | # MAGIC ### Function: `get_latest_model_version` 153 | # MAGIC 154 | # MAGIC #### Purpose 155 | # MAGIC 156 | # MAGIC - Fetches the most recent version of a specified model from the MLflow model registry. 157 | # MAGIC 158 | # MAGIC #### Parameters 159 | # MAGIC 160 | # MAGIC - `model_name`: Name of the model for which the latest version is to be fetched. 161 | # MAGIC 162 | # MAGIC #### Process 163 | # MAGIC 164 | # MAGIC 1. **Import MlflowClient**: Imports the `MlflowClient` class from the `mlflow.tracking.client` module. 165 | # MAGIC 2. **Initialize MLflow Client**: Instantiates an `MlflowClient` object. 166 | # MAGIC 3. **Retrieve Latest Model Versions**: Uses the `get_latest_versions` method to fetch the latest versions of the model. Only considers versions in the "None" stage. 167 | # MAGIC 4. **Iterate and Store Model Version**: Iterates through the returned model versions and extracts their version numbers. 168 | # MAGIC 5. 
**Return Latest Version**: Returns the most recent version number of the model. 169 | # MAGIC 170 | # MAGIC #### Libraries Used 171 | # MAGIC 172 | # MAGIC - `mlflow.tracking.client`: Required for the `MlflowClient` class which is used to interact with the MLflow tracking server. 173 | # MAGIC 174 | # MAGIC --- 175 | # MAGIC 176 | 177 | # COMMAND ---------- 178 | 179 | # Import the MlflowClient class from the mlflow.tracking.client module 180 | from mlflow.tracking.client import MlflowClient 181 | 182 | # Define a function to get the latest version of a given model 183 | def get_latest_model_version(model_name: str): 184 | # Instantiate an MlflowClient object 185 | client = MlflowClient() 186 | 187 | # Retrieve the latest versions of the specified model 188 | models = client.get_latest_versions(model_name) 189 | 190 | # Iterate through the returned models 191 | new_model_version = None 192 | for m in models: 193 | # Extract and store the version number of the model 194 | new_model_version = m.version 195 | 196 | # Return the latest version number 197 | return new_model_version 198 | 199 | # COMMAND ---------- 200 | 201 | # MAGIC %md 202 | # MAGIC 203 | # MAGIC # Code Documentation for Model Endpoint Configuration 204 | # MAGIC 205 | # MAGIC This document provides an in-depth overview of the Python code that constructs a JSON configuration for creating or updating a model serving endpoint. 206 | # MAGIC 207 | # MAGIC --- 208 | # MAGIC 209 | # MAGIC ## Code Sections 210 | # MAGIC 211 | # MAGIC ### 1. Import Required Libraries 212 | # MAGIC 213 | # MAGIC #### Purpose 214 | # MAGIC 215 | # MAGIC - Import the Python `requests` library for HTTP requests. 216 | # MAGIC 217 | # MAGIC #### Code Explanation 218 | # MAGIC 219 | # MAGIC - `import requests` 220 | # MAGIC 221 | # MAGIC #### Libraries Used 222 | # MAGIC 223 | # MAGIC - `requests`: Python library for HTTP operations. 224 | # MAGIC 225 | # MAGIC --- 226 | # MAGIC 227 | # MAGIC ### 2. Define JSON Configuration for Model Endpoint 228 | # MAGIC 229 | # MAGIC #### Purpose 230 | # MAGIC 231 | # MAGIC - Creates a JSON object that holds the configuration for the model serving endpoint. 232 | # MAGIC 233 | # MAGIC #### Code Explanation 234 | # MAGIC 235 | # MAGIC 1. `"name": model_serving_endpoint_name`: Specifies the name of the model serving endpoint. 236 | # MAGIC 2. `"config": {...}`: Holds the configuration details for the model serving endpoint. 237 | # MAGIC 3. `"served_models": [...]`: A list of dictionaries, each representing a model to be served. 238 | # MAGIC - `"model_name": model_name`: The name of the model. 239 | # MAGIC - `"model_version": get_latest_model_version(model_name=model_name)`: Calls a function to get the latest version of the specified model. 240 | # MAGIC - `"workload_size": "Small"`: Sets the workload size to "Small". 241 | # MAGIC - `"scale_to_zero_enabled": True`: Enables the endpoint to scale to zero instances when not in use. 242 | # MAGIC 243 | # MAGIC #### Libraries Used 244 | # MAGIC 245 | # MAGIC - None. 246 | # MAGIC 247 | # MAGIC #### Dependencies 248 | # MAGIC 249 | # MAGIC - `model_serving_endpoint_name`: Variable holding the endpoint name. 250 | # MAGIC - `model_name`: Variable holding the model name. 251 | # MAGIC - `get_latest_model_version()`: Function that retrieves the latest model version. 
252 | # MAGIC 253 | # MAGIC #### JSON Structure 254 | # MAGIC 255 | # MAGIC ```json 256 | # MAGIC { 257 | # MAGIC "name": model_serving_endpoint_name, 258 | # MAGIC "config": { 259 | # MAGIC "served_models": [ 260 | # MAGIC { 261 | # MAGIC "model_name": model_name, 262 | # MAGIC "model_version": get_latest_model_version(model_name=model_name), 263 | # MAGIC "workload_size": "Small", 264 | # MAGIC "scale_to_zero_enabled": True 265 | # MAGIC } 266 | # MAGIC ] 267 | # MAGIC } 268 | # MAGIC } 269 | # MAGIC ``` 270 | # MAGIC 271 | 272 | # COMMAND ---------- 273 | 274 | import requests 275 | 276 | my_json = { 277 | "name": model_serving_endpoint_name, 278 | "config": { 279 | "served_models": [{ 280 | "model_name": model_name, 281 | "model_version": get_latest_model_version(model_name=model_name), 282 | "workload_size": "Small", 283 | "scale_to_zero_enabled": True 284 | }] 285 | } 286 | } 287 | 288 | # COMMAND ---------- 289 | 290 | my_json 291 | 292 | # COMMAND ---------- 293 | 294 | # MAGIC %md 295 | # MAGIC --- 296 | # MAGIC 297 | # MAGIC # Code Documentation for Model Serving Endpoint Functions 298 | # MAGIC 299 | # MAGIC This document provides an overview of two Python functions—`func_create_endpoint` and `func_delete_model_serving_endpoint`—used for managing model serving endpoints. 300 | # MAGIC 301 | # MAGIC ## Function Overview 302 | # MAGIC 303 | # MAGIC ### `func_create_endpoint` 304 | # MAGIC 305 | # MAGIC This function either creates a new model serving endpoint or updates an existing one based on the provided parameters. 306 | # MAGIC 307 | # MAGIC ### `func_delete_model_serving_endpoint` 308 | # MAGIC 309 | # MAGIC This function deletes an existing model serving endpoint based on its name. 310 | # MAGIC 311 | # MAGIC --- 312 | # MAGIC 313 | # MAGIC ## Detailed Function Descriptions 314 | # MAGIC 315 | # MAGIC ### Function: `func_create_endpoint` 316 | # MAGIC 317 | # MAGIC #### Purpose 318 | # MAGIC 319 | # MAGIC - Creates or updates the model serving endpoint. 320 | # MAGIC 321 | # MAGIC #### Parameters 322 | # MAGIC 323 | # MAGIC - `model_serving_endpoint_name`: Name of the model serving endpoint. 324 | # MAGIC - `instance`: API instance URL. 325 | # MAGIC - `headers`: HTTP headers for API requests. 326 | # MAGIC - `my_json`: JSON configuration for the model serving endpoint. 327 | # MAGIC 328 | # MAGIC #### Process 329 | # MAGIC 330 | # MAGIC 1. **Define Endpoint URL**: Composes the URL where the endpoint is or will be hosted. 331 | # MAGIC 2. **Check for Existing Endpoint**: Makes an HTTP GET request to check if the endpoint already exists. 332 | # MAGIC 3. **Create or Update Endpoint**: 333 | # MAGIC - If the endpoint does not exist, it creates a new one with the specified configuration. 334 | # MAGIC - If the endpoint does exist, it updates the configuration. 335 | # MAGIC 4. **Poll for Configuration Activation**: Waits until the new configuration is active. Stops waiting after a pre-defined time (10 minutes). 336 | # MAGIC 5. **Status Code Verification**: Checks that the API call was successful. 337 | # MAGIC 338 | # MAGIC #### Libraries Used 339 | # MAGIC 340 | # MAGIC - `requests`: For making HTTP calls. 341 | # MAGIC - `time`: For adding sleep functionality. 342 | # MAGIC - `json`: For JSON parsing. 343 | # MAGIC - `assertpy`: For assertions. 344 | # MAGIC 345 | # MAGIC ### Function: `func_delete_model_serving_endpoint` 346 | # MAGIC 347 | # MAGIC #### Purpose 348 | # MAGIC 349 | # MAGIC - Deletes an existing model serving endpoint. 
350 | # MAGIC 351 | # MAGIC #### Parameters 352 | # MAGIC 353 | # MAGIC - `model_serving_endpoint_name`: Name of the model serving endpoint. 354 | # MAGIC - `instance`: API instance URL. 355 | # MAGIC - `headers`: HTTP headers for API requests. 356 | # MAGIC 357 | # MAGIC #### Process 358 | # MAGIC 359 | # MAGIC 1. **Define Endpoint URL**: Composes the URL where the endpoint is hosted. 360 | # MAGIC 2. **Delete Endpoint**: Makes an HTTP DELETE request to remove the endpoint. 361 | # MAGIC 3. **Status Verification**: Checks if the deletion was successful and raises an exception if it fails. 362 | # MAGIC 363 | # MAGIC #### Libraries Used 364 | # MAGIC 365 | # MAGIC - `requests`: For making HTTP calls. 366 | # MAGIC 367 | # MAGIC --- 368 | # MAGIC 369 | # MAGIC This should give a detailed explanation of what each function is doing and how it accomplishes its goals. 370 | 371 | # COMMAND ---------- 372 | 373 | import requests 374 | import time 375 | import json 376 | import assertpy 377 | 378 | def func_create_endpoint(model_serving_endpoint_name, instance, headers, my_json): 379 | """ 380 | Create or update the model serving endpoint. 381 | """ 382 | 383 | # Define the endpoint URL 384 | endpoint_url = f"https://{instance}/api/2.0/serving-endpoints" 385 | url = f"{endpoint_url}/{model_serving_endpoint_name}" 386 | 387 | # Check if the endpoint already exists 388 | r = requests.get(url, headers=headers) 389 | if "RESOURCE_DOES_NOT_EXIST" in r.text: 390 | print(f"Creating new endpoint: ",f"https://{instance}/serving-endpoints/{model_serving_endpoint_name}/invocations") 391 | re = requests.post(headers=headers, url=endpoint_url, json=my_json) 392 | else: 393 | # Extract the new model version from the JSON configuration 394 | new_model_version = my_json['config']['served_models'][0]['model_version'] 395 | print(f"This endpoint existed previously! Updating it to new model version: {new_model_version}") 396 | 397 | # Update endpoint with new config 398 | url = f"{endpoint_url}/{model_serving_endpoint_name}/config" 399 | re = requests.put(url, headers=headers, json=my_json['config']) 400 | 401 | # Poll until the new configuration is active 402 | total_wait = 0 403 | while True: 404 | r = requests.get(url, headers=headers) 405 | assertpy.assert_that(r.status_code).is_equal_to(200) 406 | 407 | endpoint = json.loads(r.text) 408 | if "pending_config" in endpoint.keys(): 409 | seconds = 10 410 | print("New config still pending") 411 | if total_wait < 600: # 10 minutes 412 | print(f"Waiting for {seconds} seconds. Total wait time: {total_wait} seconds.") 413 | time.sleep(seconds) 414 | total_wait += seconds 415 | else: 416 | print(f"Stopping after {total_wait} seconds of waiting.") 417 | break 418 | else: 419 | print("New config in place now!") 420 | break 421 | # Check the response code 422 | assertpy.assert_that(re.status_code).is_equal_to(200) 423 | 424 | def func_delete_model_serving_endpoint(model_serving_endpoint_name, instance, headers): 425 | """ 426 | Delete the model serving endpoint. 
427 | """ 428 | 429 | # Define the endpoint URL 430 | endpoint_url = f"https://{instance}/ajax-api/2.0/serving-endpoints" 431 | url = f"{endpoint_url}/{model_serving_endpoint_name}" 432 | 433 | # Delete the endpoint 434 | response = requests.delete(url, headers=headers) 435 | 436 | if response.status_code != 200: 437 | raise Exception(f"Request failed with status {response.status_code}, {response.text}") 438 | else: 439 | print(f"{model_serving_endpoint_name} endpoint is deleted!") 440 | 441 | 442 | # COMMAND ---------- 443 | 444 | func_create_endpoint(model_serving_endpoint_name, instance, headers, my_json) 445 | -------------------------------------------------------------------------------- /Chapter-09/util/training.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | from delta.tables import DeltaTable 3 | import tempfile 4 | import os 5 | import logging 6 | import shutil 7 | from pathlib import Path 8 | 9 | import numpy as np 10 | import pandas as pd 11 | import pyspark 12 | import pyspark.sql.functions as F 13 | 14 | import math 15 | from sklearn.pipeline import Pipeline 16 | from sklearn.compose import ColumnTransformer 17 | from sklearn.impute import SimpleImputer 18 | from sklearn.preprocessing import OneHotEncoder 19 | from sklearn.compose import make_column_selector as selector 20 | from sklearn.model_selection import train_test_split 21 | from sklearn.ensemble import RandomForestRegressor 22 | from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score 23 | 24 | 25 | import mlflow 26 | from mlflow.tracking import MlflowClient 27 | from mlflow.exceptions import RestException 28 | from mlflow.models.signature import ModelSignature 29 | from mlflow.types.schema import Schema, ColSpec, DataType 30 | 31 | # COMMAND ---------- 32 | 33 | #mlflow util functions to manage models 34 | def transition_model(model_version, stage): 35 | """ 36 | Transition a model to a specified stage in MLflow Model Registry using the associated 37 | mlflow.entities.model_registry.ModelVersion object. 38 | 39 | Args: 40 | model_version: mlflow.entities.model_registry.ModelVersion. ModelVersion object to transition 41 | stage: (str) New desired stage for this model version. One of "Staging", "Production", "Archived" or "None" 42 | 43 | Returns: 44 | A single mlflow.entities.model_registry.ModelVersion object 45 | """ 46 | client = MlflowClient() 47 | 48 | # Check if the stage is valid 49 | if stage not in ["Staging", "Production", "Archived", "None"]: 50 | raise ValueError(f"Invalid stage: {stage}") 51 | 52 | # Transition the model version 53 | model_version = client.transition_model_version_stage( 54 | name=model_version.name, 55 | version=model_version.version, 56 | stage=stage, 57 | archive_existing_versions=True, 58 | ) 59 | 60 | return model_version 61 | 62 | 63 | def fetch_model_version(registry_model_name, stage="Staging"): 64 | """ 65 | For a given registered model, return the MLflow ModelVersion object 66 | This contains all metadata needed, such as params logged etc 67 | 68 | Args: 69 | registry_model_name: (str) Name of MLflow Registry Model 70 | stage: (str) Stage for this model. 
One of "Staging" or "Production" 71 | 72 | Returns: 73 | mlflow.entities.model_registry.ModelVersion 74 | """ 75 | client = MlflowClient() 76 | filter_string = f'name="{registry_model_name}"' 77 | registered_model = client.search_registered_models(filter_string=filter_string)[0] 78 | 79 | # Check if the stage is valid 80 | if stage not in ["Staging", "Production"]: 81 | raise ValueError(f"Invalid stage: {stage}") 82 | 83 | # Get the latest model version in the desired stage 84 | model_version = next( 85 | (model_version for model_version in registered_model.latest_versions if model_version.current_stage == stage), 86 | None 87 | ) 88 | 89 | return model_version 90 | 91 | 92 | def get_run_from_registered_model(registry_model_name, stage="Staging"): 93 | """ 94 | Get Mlflow run object from registered model 95 | 96 | Args: 97 | registry_model_name: (str) Name of MLflow Registry Model 98 | stage: (str) Stage for this model. One of "Staging" or "Production" 99 | 100 | Returns: 101 | mlflow.entities.run.Run 102 | """ 103 | client = MlflowClient() 104 | filter_string = f'name="{registry_model_name}"' 105 | registered_model = client.search_registered_models(filter_string=filter_string)[0] 106 | 107 | # Check if the stage is valid 108 | if stage not in ["Staging", "Production"]: 109 | raise ValueError(f"Invalid stage: {stage}") 110 | 111 | # Get the latest model version in the desired stage 112 | model_version = next( 113 | (model_version for model_version in registered_model.latest_versions if model_version.current_stage == stage), 114 | None 115 | ) 116 | 117 | if model_version is None: 118 | raise ValueError(f"No model version found in stage {stage} for model {registry_model_name}") 119 | 120 | run_id = model_version.run_id 121 | run = mlflow.get_run(run_id) 122 | 123 | return run 124 | 125 | 126 | def cleanup_registered_model(registry_model_name: str) -> None: 127 | """ 128 | Deletes a registered model in MLflow model registry. 129 | 130 | To delete a model in the model registry, all model versions must first be archived. 131 | This function first archives all versions of a model in the registry, and then deletes the model. 132 | 133 | Args: 134 | registry_model_name: The name of the model in the MLflow model registry. 
135 | """ 136 | client = MlflowClient() 137 | 138 | filter_string = f'name="{registry_model_name}"' 139 | 140 | model_versions = client.search_model_versions(filter_string=filter_string) 141 | 142 | if len(model_versions) == 0: 143 | logging.info("No registered models to delete") 144 | return 145 | 146 | logging.info(f"Deleting following registered model: {registry_model_name}") 147 | 148 | # Move any versions of the model to Archived 149 | for model_version in model_versions: 150 | try: 151 | if model_version.current_stage!='Archived': 152 | client.transition_model_version_stage( 153 | name=model_version.name, 154 | version=model_version.version, 155 | stage="Archived", 156 | ) 157 | except Exception as e: 158 | logging.exception(f"Error archiving version {model_version.version} of model {registry_model_name}") 159 | raise 160 | 161 | try: 162 | client.delete_registered_model(registry_model_name) 163 | except RestException as e: 164 | logging.exception(f"Error deleting registered model {registry_model_name}") 165 | raise 166 | 167 | 168 | 169 | # COMMAND ---------- 170 | 171 | #delete any registered model from registry 172 | cleanup_registered_model(mlflow_experiment_name) 173 | 174 | # COMMAND ---------- 175 | 176 | 177 | #delta table utility functions 178 | def get_delta_version(delta_path: str) -> int: 179 | """ 180 | Gets the latest version of a Delta table given the path to the table. 181 | 182 | Args: 183 | delta_path: The path to the Delta table 184 | 185 | Returns: 186 | The version of the Delta table. 187 | """ 188 | try: 189 | delta_table = DeltaTable.forPath(spark, delta_path) 190 | delta_history= delta_table.history() 191 | 192 | # Retrieve the lastest Delta version - this is the version loaded when reading from delta_path 193 | delta_version = delta_history.first()["version"] 194 | 195 | return delta_version 196 | 197 | except AnalysisException as e: 198 | raise ValueError(f"Error getting Delta table version: {e}") 199 | 200 | def load_delta_table_from_run(run: mlflow.entities.run.Run) -> pyspark.sql.DataFrame: 201 | """ 202 | Given an MLflow run, load the Delta table which was used for that run, 203 | using the path and version tracked at tracking time. 204 | 205 | Note that by default Delta tables only retain a commit history for 30 days, meaning 206 | that previous versions older than 30 days will be deleted by default. This property can 207 | be updated using the Delta table property `delta.logRetentionDuration`. 208 | 209 | For more information, see https://docs.databricks.com/delta/delta-batch.html#data-retention 210 | 211 | Args: 212 | run: The MLflow run object. 213 | 214 | Returns: 215 | The Spark DataFrame for the Delta table used in the run. 216 | """ 217 | delta_path = run.data.params.get("delta_path") 218 | delta_version = run.data.params.get("delta_version") 219 | if not delta_path or not delta_version: 220 | raise ValueError("Error: missing delta_path or delta_version parameters.") 221 | print(f"Loading Delta table from path: {delta_path}; version: {delta_version}") 222 | try: 223 | df = spark.read.format("delta").option("versionAsOf", delta_version).load(delta_path) 224 | return df 225 | except Exception as e: 226 | print(f"Error: could not load Delta table. {str(e)}") 227 | raise 228 | 229 | # COMMAND ---------- 230 | 231 | def calculate_summary_stats(pdf: pd.DataFrame) -> pd.DataFrame: 232 | """ 233 | Create a pandas DataFrame of summary statistics for a provided pandas DataFrame. 
234 | Involved calling .describe on pandas DataFrame provided and additionally add 235 | median values and a count of null values for each column. 236 | 237 | :param pdf: pandas DataFrame 238 | :return: pandas DataFrame of sumary statistics for each column 239 | """ 240 | stats_pdf = pdf.describe(include="all") 241 | 242 | # Add median values row 243 | median_vals = pdf.median() 244 | stats_pdf.loc["median"] = median_vals 245 | 246 | # Add null values row 247 | null_count = pdf.isna().sum() 248 | stats_pdf.loc["null_count"] = null_count 249 | 250 | return stats_pdf 251 | 252 | 253 | def log_summary_stats_pdf_as_csv(pdf: pd.DataFrame) -> None: 254 | """ 255 | Log summary statistics pandas DataFrame as a csv file to MLflow as an artifact. 256 | 257 | Args: 258 | pdf: A pandas DataFrame containing summary statistics. 259 | """ 260 | with tempfile.NamedTemporaryFile(prefix="summary_stats", suffix=".csv", delete=False) as temp: 261 | pdf.to_csv(temp.name, index=True) 262 | artifact_name = "summary_stats.csv" 263 | shutil.move(temp.name, artifact_name) 264 | mlflow.log_artifact(artifact_name, artifact_path="summary_stats") 265 | os.remove(artifact_name) 266 | 267 | 268 | def load_summary_stats_pdf_from_run(run: mlflow.entities.run.Run, local_tmp_dir: str) -> pd.DataFrame: 269 | """ 270 | Given an MLflow run, download the summary stats csv artifact to a local_tmp_dir and load the 271 | csv into a pandas DataFrame. 272 | 273 | Args: 274 | run: The MLflow run object. 275 | local_tmp_dir: (str) path to a local filesystem tmp directory 276 | 277 | Returns: 278 | A pandas DataFrame containing statistics computed during training. 279 | """ 280 | 281 | # Use mlflow to download the csv file logged in the artifacts of a run to a local tmp path 282 | Path(local_tmp_dir).mkdir(parents=True, exist_ok=True) 283 | local_path=mlflow.artifacts.download_artifacts(run_id=run.info.run_id, artifact_path="summary_stats", dst_path=local_tmp_dir) 284 | print(f"Summary stats artifact downloaded in: {local_path}") 285 | 286 | # Load the csv into a pandas DataFrame 287 | summary_stats_path = os.path.join(local_path, os.listdir(local_path)[0]) 288 | try: 289 | summary_stats_pdf = pd.read_csv(summary_stats_path, index_col="Unnamed: 0") 290 | except Exception as e: 291 | raise ValueError(f"Failed to load summary stats csv from path {summary_stats_path}: {e}") 292 | 293 | return summary_stats_pdf 294 | 295 | # COMMAND ---------- 296 | 297 | def create_sklearn_rf_pipeline(model_params, seed=42): 298 | """ 299 | Create the sklearn pipeline required for the RandomForestRegressor. 300 | We compose two components of the pipeline separately - one for numeric cols, one for categorical cols 301 | These are then combined with the final RandomForestRegressor stage, which uses the model_params dict 302 | provided via the args. The unfitted pipeline is returned. 303 | 304 | For a robust pipeline in practice, one should also have a pipeline stage to add indicator columns for those features 305 | which have been imputed. This can be useful to encode information about those instances which have been imputed with 306 | a given value. We refrain from doing so here to simplify the pipeline, and focus on the overall workflow. 
307 | 308 | Args: 309 | model_params: (dict) Dictionary of model parameters to pass into sklearn RandomForestRegressor 310 | seed : (int) Random seed to set via random_state arg in RandomForestRegressor 311 | 312 | Returns: 313 | sklearn pipeline 314 | """ 315 | # Create pipeline component for numeric Features 316 | numeric_transformer = Pipeline(steps=[ 317 | ("imputer", SimpleImputer(strategy='median'))]) 318 | 319 | # Create pipeline component for categorical Features 320 | categorical_transformer = Pipeline(steps=[ 321 | ("imputer", SimpleImputer(strategy="most_frequent")), 322 | ("ohe", OneHotEncoder(handle_unknown="ignore"))]) 323 | 324 | # Combine numeric and categorical components into one preprocessor pipeline 325 | # Use ColumnTransformer to apply the different preprocessing pipelines to different subsets of features 326 | # Use selector (make_column_selector) to select which subset of features to apply pipeline to 327 | preprocessor = ColumnTransformer(transformers=[ 328 | ("numeric", numeric_transformer, selector(dtype_exclude="category")), 329 | ("categorical", categorical_transformer, selector(dtype_include="category")) 330 | ]) 331 | 332 | pipeline = Pipeline(steps=[ 333 | ("preprocessor", preprocessor), 334 | ("rf", RandomForestRegressor(random_state=seed, **model_params)) 335 | ]) 336 | 337 | return pipeline 338 | 339 | def train_sklearn_rf_model(run_name, delta_path, model_params, misc_params, seed=42): 340 | """ 341 | Function to trigger training and evaluation of an sklearn RandomForestRegressor model. 342 | 343 | Parameters, metrics, and artifacts are logged to MLflow during this process. 344 | 345 | Returns the MLflow run object. 346 | 347 | Args: 348 | run_name: (str) Name to give to MLflow run. 349 | delta_path: (str) Path to Delta table to use as input data. 350 | model_params: (dict) Dictionary of model parameters to pass into sklearn RandomForestRegressor. 351 | misc_params: (dict) Dictionary of parameters to use. 352 | seed: (int) Random seed. 353 | 354 | Returns: 355 | mlflow.entities.run.Run 356 | """ 357 | 358 | #end any active run 359 | mlflow.end_run() 360 | 361 | # Enable MLflow autologging. 362 | mlflow.autolog(log_input_examples=True, silent=True) 363 | 364 | # Load Delta table from `delta_path`. 365 | df = spark.read.format("delta").load(delta_path) 366 | 367 | # Log `delta_path` and version. 368 | mlflow.log_param("delta_path", delta_path) 369 | delta_version = get_delta_version(delta_path) 370 | mlflow.log_param("delta_version", delta_version) 371 | 372 | # Track misc parameters used in pipeline creation (preprocessing) as JSON artifact. 373 | mlflow.log_dict(misc_params, "preprocessing_params.json") 374 | 375 | # Convert Spark DataFrame to pandas, as we will be training an sklearn model. 376 | pdf = df.toPandas() 377 | 378 | # Convert all categorical columns to category dtype. 379 | for c in misc_params["cat_cols"]: 380 | pdf[c] = pdf[c].astype("category") 381 | 382 | #keek only the required columns 383 | cols_to_keep = np.concatenate(([misc_params['target_col']], misc_params['cat_cols'], misc_params['num_cols']), axis=None) 384 | pdf = pdf[cols_to_keep] 385 | 386 | 387 | # Create summary statistics pandas DataFrame and log as a CSV to MLflow. 388 | summary_stats_pdf = calculate_summary_stats(pdf[cols_to_keep]) 389 | log_summary_stats_pdf_as_csv(summary_stats_pdf) 390 | 391 | # Track number of total instances and "month". 392 | num_instances = pdf.shape[0] 393 | mlflow.log_param("num_instances", num_instances) # Log number of instances. 
394 | mlflow.log_param("month", misc_params["month"]) # Log month number. 395 | 396 | # Split data. 397 | X = pdf.drop([misc_params["target_col"]], axis=1) 398 | y = pdf[misc_params["target_col"]] 399 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=seed) 400 | 401 | # Track train/test data info as parameters. 402 | num_training = X_train.shape[0] 403 | mlflow.log_param("num_training_instances", num_training) 404 | num_test = X_test.shape[0] 405 | mlflow.log_param("num_test_instances", num_test) 406 | 407 | # Fit sklearn pipeline with RandomForestRegressor model. 408 | rf_pipeline = create_sklearn_rf_pipeline(model_params) 409 | rf_pipeline.fit(X_train, y_train) 410 | 411 | # Make predictions on the test data 412 | y_pred = rf_pipeline.predict(X_test) 413 | 414 | # Calculate evaluation metrics on the test data 415 | mae = mean_absolute_error(y_test, y_pred) 416 | mse = mean_squared_error(y_test, y_pred) 417 | r2 = r2_score(y_test, y_pred) 418 | rmse = math.sqrt(mse) 419 | 420 | # Specify data schema which the model will use as its ModelSignature. 421 | 422 | input_schema = Schema([ 423 | ColSpec(name="Weather_Condition", type=DataType.string), 424 | ColSpec(name="Promotion_Type", type=DataType.string), 425 | ColSpec(name="Device_Type", type=DataType.string), 426 | ColSpec(name="Temperature", type=DataType.float), 427 | ColSpec(name="Website_Traffic", type=DataType.integer) 428 | ]) 429 | 430 | output_schema = Schema([ColSpec("integer")]) 431 | signature = ModelSignature(input_schema, output_schema) 432 | mlflow.sklearn.log_model(rf_pipeline, "model", signature=signature) 433 | 434 | return mlflow.active_run() 435 | 436 | 437 | 438 | -------------------------------------------------------------------------------- /Chapter-09/util/monitoring.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | # MAGIC %md 3 | # MAGIC 4 | # MAGIC #### Monitoring Utility Functions 5 | # MAGIC 6 | # MAGIC The following functions check 7 | # MAGIC - the proportion of nulls 8 | # MAGIC - the differences in summary statistics 9 | # MAGIC - the shifts in distributions 10 | 11 | # COMMAND ---------- 12 | 13 | 14 | from scipy import stats 15 | import matplotlib.pyplot as plt 16 | import seaborn as sns 17 | 18 | # COMMAND ---------- 19 | 20 | def check_null_proportion(new_pdf, null_proportion_threshold): 21 | """ 22 | Function to compute the proportions of nulls for all columns in a Spark DataFrame and return any features that exceed the specified null threshold. 23 | 24 | Args: 25 | df: (pd.DataFrame) The DataFrame that contains new incoming data. 26 | null_proportion_threshold: (float) A numeric value ranging from 0 and 1 that specifies the tolerable fraction of nulls. 27 | 28 | Returns: 29 | A dictionary mapping feature names to their null proportions. 30 | 31 | Raises: 32 | ValueError: If the null proportion threshold is not between 0 and 1. 33 | 34 | Notes: 35 | * This function uses the `isnull()` method to identify null values in the DataFrame. 36 | * The `sum()` method is used to count the number of null values in each column. 37 | * The `len()` method is used to get the total number of rows in the DataFrame. 38 | * The `transpose()` method is used to convert the DataFrame from a long format to a wide format. 39 | * The `assert` statement is used to check that the null proportion threshold is between 0 and 1. 
40 | * The `print()` statement is used to print an alert if there are any features that exceed the null proportion threshold. 41 | """ 42 | 43 | # Check that the null proportion threshold is between 0 and 1. 44 | if null_proportion_threshold < 0 or null_proportion_threshold > 1: 45 | raise ValueError( 46 | "The null proportion threshold must be between 0 and 1. " 47 | f"Received: {null_proportion_threshold}" 48 | ) 49 | 50 | # Compute the proportions of nulls for all columns in the DataFrame. 51 | missing_stats = pd.DataFrame(new_pdf.isnull().sum() / len(new_pdf)).transpose() 52 | 53 | # Get a list of the column names that exceed the null proportion threshold. 54 | null_col_list = missing_stats.columns[(missing_stats >= null_proportion_threshold).iloc[0]] 55 | 56 | # Create a dictionary mapping feature names to their null proportions. 57 | null_dict = {} 58 | for feature in null_col_list: 59 | null_dict[feature] = missing_stats[feature][0] 60 | 61 | # Check if any features exceed the null proportion threshold. 62 | if len(null_dict) > 0: 63 | print("Alert: There are feature(s) that exceed(s) the expected null threshold. Please ensure that the data is ingested correctly") 64 | print(null_dict) 65 | 66 | # Return the dictionary of null proportions. 67 | return null_dict 68 | 69 | 70 | # COMMAND ---------- 71 | 72 | def check_diff_in_summary_stats(new_stats_pdf, prod_stats_pdf, num_cols, stats_threshold_limit, statistic_list): 73 | """ 74 | Function to check if the new summary stats significantly deviates from the summary stats in the production data by a certain threshold. 75 | 76 | Args: 77 | new_stats_pdf: (pd.DataFrame) summary statistics of incoming data 78 | prod_stats_pdf: (pd.DataFrame) summary statistics of production data 79 | num_cols: (list) a list of numeric columns 80 | stats_threshold_limit: (float) a float < 1 that signifies the threshold limit 81 | statistic_list: (list) a list of statistics, e.g., mean, std, min, max 82 | 83 | Returns: 84 | A list of feature names that significantly deviate from the production data. 85 | 86 | Raises: 87 | ValueError: If the stats_threshold_limit is not between 0 and 1. 88 | 89 | Notes: 90 | * This function uses the `loc` method to get the value of a specific statistic for a given feature. 91 | * The `round` method is used to round a number to a specified number of decimal places. 92 | * The `print` statement is used to print the results of the function. 93 | """ 94 | 95 | # Check that the stats_threshold_limit is between 0 and 1. 96 | if stats_threshold_limit < 0 or stats_threshold_limit > 1: 97 | raise ValueError( 98 | "The stats_threshold_limit must be between 0 and 1. " 99 | f"Received: {stats_threshold_limit}" 100 | ) 101 | 102 | # Create a list of feature names that significantly deviate from the production data. 103 | feature_diff_list = [] 104 | 105 | # Iterate over the numeric columns. 106 | for feature in num_cols: 107 | 108 | # Print a message indicating that the feature is being checked. 109 | print(f"\nCHECKING {feature}.........") 110 | 111 | # Iterate over the statistics. 112 | for statistic in statistic_list: 113 | 114 | # Get the value of the statistic for the feature in the production data. 115 | prod_stat_value = prod_stats_pdf[[str(feature)]].loc[str(statistic)][0] 116 | 117 | # Calculate the upper and lower threshold limits for the statistic. 
118 | upper_val_limit = prod_stat_value * (1 + stats_threshold_limit) 119 | lower_val_limit = prod_stat_value * (1 - stats_threshold_limit) 120 | 121 | # Get the value of the statistic for the feature in the new data. 122 | new_stat_value = new_stats_pdf[[str(feature)]].loc[str(statistic)][0] 123 | 124 | # Check if the new statistic value is outside of the threshold limits. 125 | if new_stat_value < lower_val_limit: 126 | feature_diff_list.append(str(feature)) 127 | print(f"\tThe {statistic} {feature} in the new data is at least {stats_threshold_limit * 100}% lower than the {statistic} in the production data. Decreased from {round(prod_stat_value, 2)} to {round(new_stat_value, 2)}.") 128 | 129 | elif new_stat_value > upper_val_limit: 130 | feature_diff_list.append(str(feature)) 131 | print(f"\tThe {statistic} {feature} in the new data is at least {stats_threshold_limit * 100}% higher than the {statistic} in the production data. Increased from {round(prod_stat_value, 2)} to {round(new_stat_value, 2)}.") 132 | 133 | # Return the list of feature names that significantly deviate from the production data. 134 | return np.unique(feature_diff_list) 135 | 136 | 137 | # COMMAND ---------- 138 | 139 | def check_diff_in_variances(reference_df, new_df, num_cols, p_threshold): 140 | """ 141 | Function to check if the variances of the numeric columns in `new_df` are significantly different from the variances of the corresponding columns in `reference_df`. 142 | 143 | Args: 144 | reference_df: (pd.DataFrame) The DataFrame that contains the production data. 145 | new_df: (pd.DataFrame) The DataFrame that contains the new data. 146 | num_cols: (list) A list of the names of the numeric columns. 147 | p_threshold: (float) The p-value threshold for significance. 148 | 149 | Returns: 150 | A dictionary mapping feature names to their p-values. 151 | 152 | Raises: 153 | ValueError: If `p_threshold` is not between 0 and 1. 154 | 155 | Notes: 156 | * This function uses the `levene()` function from the `scipy.stats` module to perform the Levene test. 157 | * The `assert` statement is used to check that `p_threshold` is between 0 and 1. 158 | * The `print()` statements are used to print the results of the function. 159 | """ 160 | 161 | # Check that `p_threshold` is between 0 and 1. 162 | if p_threshold < 0 or p_threshold > 1: 163 | raise ValueError( 164 | "The p_threshold must be between 0 and 1. " 165 | f"Received: {p_threshold}" 166 | ) 167 | 168 | # Create a dictionary mapping feature names to their p-values. 169 | var_dict = {} 170 | 171 | # Iterate over the numeric columns. 172 | for feature in num_cols: 173 | 174 | # Perform the Levene test. 175 | levene_stat, levene_pval = stats.levene(reference_df[feature], new_df[feature], center="median") 176 | 177 | # If the p-value is less than or equal to the threshold, then the variances are significantly different. 178 | if levene_pval <= p_threshold: 179 | var_dict[feature] = levene_pval 180 | 181 | # Check if any features have significantly different variances. 182 | if len(var_dict) > 0: 183 | print(f"The feature(s) below have significantly different variances compared to production data at p-value {p_threshold}") 184 | print(var_dict) 185 | else: 186 | print(f"No features have significantly different variances compared to production data at p-value {p_threshold}") 187 | 188 | # Return the dictionary of p-values. 
189 | return var_dict 190 | 191 | 192 | # COMMAND ---------- 193 | 194 | def check_dist_ks_bonferroni_test(reference_df, new_df, num_cols, p_threshold, ks_alternative="two-sided"): 195 | """ 196 | Function to take two pandas DataFrames and compute the Kolmogorov-Smirnov statistic on 2 sample distributions 197 | where the variable in question is continuous. 198 | This is a two-sided test for the null hypothesis that 2 independent samples are drawn from the same continuous 199 | distribution. If the KS statistic is small or the p-value is high, then we cannot reject the hypothesis that 200 | the distributions of the two samples are the same. 201 | The alternative hypothesis can be either ‘two-sided’ (default), ‘less’ or ‘greater’. 202 | This function assumes that the distributions to compare have the same column name in both DataFrames. 203 | 204 | Args: 205 | reference_df: pandas DataFrame containing column with the distribution to be compared 206 | new_df: pandas DataFrame containing column with the distribution to be compared 207 | num_cols: (list) A list of the names of the numeric columns. 208 | p_threshold: (float) The p-value threshold for significance. 209 | ks_alternative: Defines the alternative hypothesis - ‘two-sided’ (default), ‘less’ or ‘greater’. 210 | 211 | Returns: 212 | A dictionary mapping feature names to their p-values. 213 | 214 | Raises: 215 | ValueError: If `p_threshold` is not between 0 and 1. 216 | 217 | Notes: 218 | * This function uses the `ks_2samp()` function from the `scipy.stats` module to perform the Kolmogorov-Smirnov test. 219 | * The `assert` statement is used to check that `p_threshold` is between 0 and 1. 220 | * The `print()` statements are used to print the results of the function. 221 | * The Bonferroni correction is used to adjust the p-value threshold to account for multiple comparisons. 222 | """ 223 | 224 | # Check that `p_threshold` is between 0 and 1. 225 | if p_threshold < 0 or p_threshold > 1: 226 | raise ValueError( 227 | "The p_threshold must be between 0 and 1. " 228 | f"Received: {p_threshold}" 229 | ) 230 | 231 | # Compute the Bonferroni-corrected alpha level. 232 | corrected_alpha = p_threshold / len(num_cols) 233 | 234 | # Create a dictionary mapping feature names to their p-values. 235 | ks_dict = {} 236 | 237 | # Iterate over the numeric columns. 238 | for feature in num_cols: 239 | 240 | # Compute the Kolmogorov-Smirnov statistic and p-value. 241 | ks_stat, ks_pval = stats.ks_2samp(reference_df[feature], new_df[feature], alternative=ks_alternative, mode="asymp") 242 | 243 | # If the p-value is less than or equal to the corrected alpha level, then the distributions are significantly different. 244 | if ks_pval <= corrected_alpha: 245 | ks_dict[feature] = ks_pval 246 | 247 | # Check if any features have significantly different distributions. 248 | if len(ks_dict) > 0: 249 | print(f"The feature(s) below have significantly different distributions compared to production data at Bonferroni-corrected alpha level of {round(corrected_alpha, 4)}, according to the KS test") 250 | print("\t", ks_dict) 251 | else: 252 | print(f"No feature distributions has shifted according to the KS test at the Bonferroni-corrected alpha level of {round(corrected_alpha, 4)}. ") 253 | 254 | # Return the dictionary of p-values. 
255 |     return ks_dict
256 | 
257 | # COMMAND ----------
258 | 
259 | def check_categorical_diffs(reference_pdf, new_pdf, cat_cols, p_threshold):
260 |     """
261 |     This function checks if there are differences in expected counts for categorical variables between the incoming data and the data in production.
262 | 
263 |     Args:
264 |         reference_pdf: (pandas DataFrame) data in production (the reference data)
265 |         new_pdf: (pandas DataFrame) new incoming data
266 |         cat_cols: (list) a list of categorical columns
267 |         p_threshold: (float) The p-value threshold for significance.
268 |     Returns:
269 |         A dictionary mapping feature names to their p-values.
270 | 
271 |     Raises:
272 |         ValueError: If `p_threshold` is not between 0 and 1.
273 | 
274 |     Notes:
275 |     * This function uses the `chi2_contingency()` function from the `scipy.stats` module to perform the chi-squared test of independence.
276 |     * A `ValueError` is raised if `p_threshold` is not between 0 and 1.
277 |     * The `print()` statements are used to print the results of the function.
278 |     """
279 | 
280 |     # Check that `p_threshold` is between 0 and 1.
281 |     if p_threshold < 0 or p_threshold > 1:
282 |         raise ValueError(
283 |             "The p_threshold must be between 0 and 1. "
284 |             f"Received: {p_threshold}"
285 |         )
286 | 
287 |     # Create a dictionary mapping feature names to their p-values.
288 |     chi_dict = {}
289 | 
290 |     # Iterate over the categorical columns.
291 |     for feature in cat_cols:
292 | 
293 |         # Calculate the observed frequencies by creating a contingency table using pd.crosstab
294 |         observed_freq = pd.crosstab(reference_pdf[feature], new_pdf[feature])
295 | 
296 |         # Perform the Chi-Square test of independence
297 |         chi2, p_value, _, _ = stats.chi2_contingency(observed_freq)
298 | 
299 |         # If the p-value is less than or equal to the threshold, then the expected counts are significantly different.
300 |         if p_value <= p_threshold:
301 |             chi_dict[feature] = p_value
302 | 
303 |     # Check if any features have significantly different expected counts.
304 |     if len(chi_dict) > 0:
305 |         print(f"The following categorical variables have significantly different expected counts compared to the production data at p-value {p_threshold}:")
306 |         print("\t", chi_dict)
307 |     else:
308 |         print(f"No categorical variables have significantly different expected counts compared to the production data at p-value {p_threshold}.")
309 | 
310 |     return chi_dict
311 | 
312 | # COMMAND ----------
313 | 
314 | def compare_model_perfs(current_staging_run, current_prod_run, min_model_perf_threshold, metric_to_check):
315 |     """
316 |     This function compares the performances of the models in staging and in production.
317 | 
318 |     Args:
319 |         current_staging_run: MLflow run that contains information on the staging model
320 |         current_prod_run: MLflow run that contains information on the production model
321 |         min_model_perf_threshold (float): The minimum threshold that the staging model should exceed before being transitioned to production
322 |         metric_to_check (string): The metric that the user is interested in using to compare model performances
323 | 
324 |     Returns:
325 |         None. Prints a recommendation on whether or not to transition the staging model to production.
326 | 
327 |     Raises:
328 |         ValueError: If `min_model_perf_threshold` is not positive.
329 | 
330 |     Notes:
331 |     * This function uses the `data.metrics` attribute of the MLflow runs to get the metrics for the staging and production models.
332 |     * The `round()` function is used to round the difference in performance to two decimal places.
333 |     * The `print()` statements are used to print the results of the function.
334 | """ 335 | 336 | # Check that `min_model_perf_threshold` is positive. 337 | if min_model_perf_threshold < 0: 338 | raise ValueError( 339 | "The min_model_perf_threshold must be positive. " 340 | f"Received: {min_model_perf_threshold}" 341 | ) 342 | 343 | # Calculate the difference in performance between the staging and production models. 344 | model_diff_fraction = current_staging_run.data.metrics[str(metric_to_check)] / current_prod_run.data.metrics[str(metric_to_check)] 345 | model_diff_percent = round((model_diff_fraction - 1)*100, 2) 346 | 347 | # Print the performance of the staging and production models. 348 | print(f"Staging run's {metric_to_check}: {round(current_staging_run.data.metrics[str(metric_to_check)],3)}") 349 | print(f"Current production run's {metric_to_check}: {round(current_prod_run.data.metrics[str(metric_to_check)],3)}") 350 | 351 | # Recommend whether to transition the staging model to production. 352 | if model_diff_percent >= 0 and (model_diff_fraction - 1 >= min_model_perf_threshold): 353 | print(f"The current staging run exceeds the model improvement threshold of at least +{min_model_perf_threshold}. You may proceed with transitioning the staging model to production now.") 354 | 355 | elif model_diff_percent >= 0 and (model_diff_fraction - 1 < min_model_perf_threshold): 356 | print(f"CAUTION: The current staging run does not meet the improvement threshold of at least +{min_model_perf_threshold}. Transition the staging model to production with caution.") 357 | else: 358 | print(f"ALERT: The current staging run underperforms by {model_diff_percent}% when compared to the production model. Do not transition the staging model to production.") 359 | 360 | 361 | # COMMAND ---------- 362 | 363 | def plot_boxplots(unique_feature_diff_array, reference_pdf, new_pdf): 364 | """ 365 | Plot boxplots comparing the distributions of unique features between incoming data and production data. 366 | 367 | Args: 368 | unique_feature_diff_array (list): List of unique feature names to compare. 369 | reference_pdf (pandas.DataFrame): Reference production data. 370 | new_pdf (pandas.DataFrame): New incoming data. 371 | 372 | Returns: 373 | matplotlib.figure.Figure: The generated figure. 374 | 375 | Raises: 376 | None 377 | 378 | """ 379 | # Set the theme of the plots. 380 | sns.set_theme(style="whitegrid") 381 | 382 | # Calculate the number of columns. 383 | num_columns = len(unique_feature_diff_array) 384 | 385 | # Create a figure and axes. 386 | fig, axes = plt.subplots(1, num_columns, figsize=(5*num_columns, 5)) 387 | 388 | # Set the title of the figure. 389 | fig.suptitle("Distribution Comparisons between Incoming Data and Production Data") 390 | 391 | # Plot boxplots for each column name side by side. 392 | for i, column_name in enumerate(unique_feature_diff_array): 393 | ax = axes[i] if num_columns > 1 else axes # Access the correct subplot. 394 | ax.boxplot([reference_pdf[column_name], new_pdf[column_name]]) 395 | ax.set_xticklabels(['Production Data', 'New Incoming Data']) 396 | ax.set_title(column_name) 397 | 398 | # Set common y-axis label. 399 | fig.text(0.04, 0.5, 'Value', va='center', rotation='vertical') 400 | 401 | # Set plot title. 402 | plt.suptitle('Boxplot Comparison') 403 | 404 | plt.close() 405 | 406 | # Return the generated figure. 
407 | return fig 408 | 409 | -------------------------------------------------------------------------------- /Chapter-06/model-registry-and-webhooks.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | # MAGIC 3 | # MAGIC %md 4 | # MAGIC ## Author 5 | # MAGIC 6 | # MAGIC - **Debu Sinha** 7 | # MAGIC 8 | # MAGIC ## Tested Environment 9 | # MAGIC 10 | # MAGIC - **Databricks Runtime**: This notebook is tested on Databricks Runtime for Machine Learning 13.3 LTS or above. 11 | # MAGIC - **Cluster Configuration**: Single node cluster with at least 32GB RAM and 4 VCPU. 12 | # MAGIC - **Note**: The same cluster set up in Chapters 3 and 4 will be used here. 13 | # MAGIC 14 | # MAGIC ## Cluster Setup Instructions 15 | # MAGIC 16 | # MAGIC 1. **Create a Cluster**: 17 | # MAGIC - Navigate to the `Compute` icon on the left sidebar and click on `Create Cluster`. 18 | # MAGIC - Under `Policy`, select `Unrestricted`. 19 | # MAGIC - Enter a name for your cluster, for example, `demo`, into the cluster name text box. 20 | # MAGIC - In `Cluster Mode`, select `Single Node`. 21 | # MAGIC - Choose `Databricks Runtime Version` 13.3 LTS (Scala 2.12, Spark 3.4.1) from the `ML` tab. 22 | # MAGIC - On `AWS`, select `i3.xlarge` / on `Azure`, select `Standard_DS4_V2` as __Node type__. 23 | # MAGIC - Click on `Create Cluster` and wait for your cluster to be provisioned. 24 | # MAGIC 25 | # MAGIC 2. **Attach this Notebook to Your Cluster**: 26 | # MAGIC - Click on the menu labeled `Detached` at the top left of this workbook. 27 | # MAGIC - Select your cluster name to attach this notebook to your cluster. 28 | # MAGIC 29 | # MAGIC ## MLflow Model Registry API 30 | # MAGIC 31 | # MAGIC This section demonstrates how to register a model in the registry and request its transition to the staging environment. 32 | 33 | # COMMAND ---------- 34 | 35 | # MAGIC %md 36 | # MAGIC ### Retrieving the Most Recently Updated Experiment from the MLflow Server 37 | # MAGIC 38 | # MAGIC In this code snippet, several key tasks are carried out: 39 | # MAGIC 40 | # MAGIC 1. **Initialize MLflow Client**: 41 | # MAGIC - The MLflow tracking client is initialized to interact with the MLflow server. 42 | # MAGIC 43 | # MAGIC 2. **Fetch Available Experiments**: 44 | # MAGIC - A list of all available experiments is fetched using the `search_experiments()` method of the client. 45 | # MAGIC 46 | # MAGIC 3. **Sort Experiments by Last Update Time**: 47 | # MAGIC - The fetched experiments are sorted based on their last update time in descending order, ensuring that the most recently modified experiment comes first. 48 | # MAGIC 49 | # MAGIC 4. **Retrieve Latest Experiment**: 50 | # MAGIC - The most recently updated experiment is then extracted from the sorted list and stored in the `latest_experiment` variable. 51 | # MAGIC 52 | # MAGIC 5. **Display Experiment Name**: 53 | # MAGIC - The name of the most recently updated experiment is printed out for confirmation. 54 | # MAGIC 55 | # MAGIC > **Note**: If you are specifically interested in the experiment related to AutoML for base model creation, make sure that the `latest_experiment` actually corresponds to that particular experiment. 
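# A hedged aside: if you already know which experiment you need (for example, the AutoML
# experiment created in an earlier chapter), it is safer to pin it by name than to rely on the
# "most recently updated" lookup in the next cell. The experiment path below is a placeholder,
# not a path defined elsewhere in this repository.
#
#   pinned_experiment = mlflow.get_experiment_by_name("/Users/<your-user>/<automl-experiment-name>")
#   if pinned_experiment is not None:
#       latest_experiment = pinned_experiment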
56 | # MAGIC 57 | 58 | # COMMAND ---------- 59 | 60 | import mlflow 61 | 62 | # Initialize the MLflow client 63 | client = mlflow.tracking.MlflowClient() 64 | 65 | # Fetch all available experiments 66 | experiments = client.search_experiments() 67 | 68 | # Sort the experiments by their last update time in descending order 69 | sorted_experiments = sorted(experiments, key=lambda x: x.last_update_time, reverse=True) 70 | 71 | # Retrieve the most recently updated experiment 72 | latest_experiment = sorted_experiments[0] 73 | 74 | # Output the name of the latest experiment 75 | print(f"The most recently updated experiment is named '{latest_experiment.name}'.") 76 | 77 | # Note: If you're specifically looking for the experiment related to AutoML for base model creation, 78 | # ensure that 'latest_experiment' corresponds to that experiment. 79 | 80 | # COMMAND ---------- 81 | 82 | # MAGIC %md 83 | # MAGIC ### Identifying the Best Model Run ID from a Specific Experiment in MLflow 84 | # MAGIC 85 | # MAGIC In this code snippet, the objective is multi-fold: 86 | # MAGIC 87 | # MAGIC 1. **Fetch Current User's Username**: 88 | # MAGIC - Utilizes Databricks utilities to programmatically fetch the username. This could be useful for traceability or logging purposes. 89 | # MAGIC 90 | # MAGIC 2. **Set Experiment and Model Names**: 91 | # MAGIC - Retrieves the name of the most recently updated experiment, assumed to have been set in earlier steps. 92 | # MAGIC - Defines a specific name for the model in the registry, which in this case is "Churn Prediction Bank". 93 | # MAGIC 94 | # MAGIC 3. **Fetch and Sort Experiment Runs**: 95 | # MAGIC - Retrieves the details of the experiment using its name. 96 | # MAGIC - Searches for all runs within the experiment and sorts them based on the F1 score on the validation set, in descending order. 97 | # MAGIC 98 | # MAGIC 4. **Identify the Best Model Run ID**: 99 | # MAGIC - The run ID corresponding to the highest validation F1 score is then stored in the `best_run_id` variable. 100 | # MAGIC 101 | # MAGIC > **Note**: The `best_run_id` variable now holds the run ID of the model that performed best in the specified experiment, according to the F1 score on the validation set. 
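# A hedged alternative sketch: MlflowClient.search_runs can sort server-side, which avoids pulling
# every run into pandas. It assumes the runs logged a `val_f1_score` metric (as in this example)
# and that `experiment_details` has been set as in the next cell.
#
#   best_runs = client.search_runs(
#       [experiment_details.experiment_id],
#       order_by=["metrics.val_f1_score DESC"],
#       max_results=1,
#   )
#   best_run_id = best_runs[0].info.run_id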
102 | # MAGIC
103 | # MAGIC
104 | 
105 | # COMMAND ----------
106 | 
107 | # Initialize the Databricks utilities to programmatically fetch the username
108 | username = dbutils.notebook.entry_point.getDbutils().notebook().getContext().userName().get()
109 | 
110 | # Retrieve the name of the latest experiment; assumed to have been set in earlier steps
111 | experiment_name = latest_experiment.name
112 | 
113 | # Define the model name for the registry, specific to our use-case of Churn Prediction for a Bank
114 | registry_model_name = "Churn Prediction Bank"
115 | 
116 | # Fetch the experiment details using its name
117 | experiment_details = client.get_experiment_by_name(experiment_name)
118 | 
119 | # Search for runs within the experiment and sort them by validation F1 score in descending order
120 | sorted_runs = mlflow.search_runs(experiment_details.experiment_id).sort_values("metrics.val_f1_score", ascending=False)
121 | 
122 | # Get the run ID of the best model based on the highest validation F1 score
123 | best_run_id = sorted_runs.iloc[0]["run_id"]  # positional indexing: sort_values does not reset the index, so .loc[0] would return the unsorted first row
124 | 
125 | best_run_id
126 | # Note: The variable `best_run_id` now contains the run ID of the best model in the specified experiment
127 | 
128 | # COMMAND ----------
129 | 
130 | # MAGIC %md
131 | # MAGIC ### Registering the Best Model in MLflow's Model Registry
132 | # MAGIC
133 | # MAGIC The aim of this code block is to register the best-performing model (based on the highest validation F1 score) in MLflow's model registry. Here's how it does it:
134 | # MAGIC
135 | # MAGIC 1. **Initialize Model URI**:
136 | # MAGIC    - Constructs the model URI using the `best_run_id` obtained from previous steps. The URI will uniquely identify the model's location.
137 | # MAGIC
138 | # MAGIC 2. **Attempt Model Registration**:
139 | # MAGIC    - Tries to register the model under the name specified by `registry_model_name`.
140 | # MAGIC
141 | # MAGIC 3. **Success and Failure Scenarios**:
142 | # MAGIC    - Prints a success message along with the model URI if the model registration is successful.
143 | # MAGIC    - Captures and prints an error message if it fails to register the model.
144 | # MAGIC
145 | # MAGIC > **Note**: The `model_details` variable will be populated with details about the registered model if the registration is successful. These details include the model name, version, and other metadata.
146 | # MAGIC
147 | 
148 | # COMMAND ----------
149 | 
150 | # Initialize the model's URI using the best run ID obtained from previous steps
151 | model_uri = f"runs:/{best_run_id}/model"
152 | 
153 | # Register the model in MLflow's model registry under the specified name
154 | try:
155 |     model_details = mlflow.register_model(model_uri=model_uri, name=registry_model_name)
156 |     print(f"Successfully registered model '{registry_model_name}' with URI '{model_uri}'.")
157 | except mlflow.exceptions.MlflowException as e:
158 |     print(f"Failed to register model '{registry_model_name}': {str(e)}")
159 | 
160 | model_details
161 | # Note: The variable `model_details` now contains details about the registered model
162 | 
163 | # COMMAND ----------
164 | 
165 | # MAGIC %md
166 | # MAGIC ### Updating Model Metadata in the MLflow Model Registry
167 | # MAGIC
168 | # MAGIC In this step, we accomplish two primary tasks:
169 | # MAGIC
170 | # MAGIC 1. **Update Registered Model Metadata**:
171 | # MAGIC    - We attempt to update the description of an already registered model in the MLflow Model Registry.
172 | # MAGIC - The description aims to clarify the purpose of the model, in this case, "This model predicts whether a bank customer will churn or not." 173 | # MAGIC 174 | # MAGIC 2. **Update Version-Specific Metadata**: 175 | # MAGIC - We update the metadata for a specific version of the model. 176 | # MAGIC - Here, we add a description specifying that this model version is based on scikit-learn. 177 | # MAGIC 178 | # MAGIC Both operations are wrapped in try-except blocks for robust error handling. Should any operation fail, an error message will be printed to provide insight into the failure. 179 | # MAGIC 180 | # MAGIC > **Note**: The `model_details` variable is assumed to contain essential information about the registered model and its specific version. 181 | # MAGIC 182 | 183 | # COMMAND ---------- 184 | 185 | # Update the metadata of an already registered model 186 | try: 187 | client.update_registered_model( 188 | name=model_details.name, 189 | description="This model predicts whether a bank customer will churn or not." 190 | ) 191 | print(f"Successfully updated the description for the registered model '{model_details.name}'.") 192 | except mlflow.exceptions.MlflowException as e: 193 | print(f"Failed to update the registered model '{model_details.name}': {str(e)}") 194 | 195 | # Update the metadata for a specific version of the model 196 | try: 197 | client.update_model_version( 198 | name=model_details.name, 199 | version=model_details.version, 200 | description="This is a scikit-learn based model." 201 | ) 202 | print(f"Successfully updated the description for version {model_details.version} of the model '{model_details.name}'.") 203 | except mlflow.exceptions.MlflowException as e: 204 | print(f"Failed to update version {model_details.version} of the model '{model_details.name}': {str(e)}") 205 | 206 | # Note: The `model_details` variable is assumed to contain details about the registered model and its version 207 | 208 | # COMMAND ---------- 209 | 210 | # MAGIC %md 211 | # MAGIC ### Transitioning Model Version to 'Staging' Stage in the MLflow Model Registry 212 | # MAGIC 213 | # MAGIC In this step, the following objectives are met: 214 | # MAGIC 215 | # MAGIC 1. **Transition Model Version**: 216 | # MAGIC - We aim to transition a specific version of the registered model to the 'Staging' stage in the MLflow Model Registry. 217 | # MAGIC 218 | # MAGIC 2. **Archiving Existing Versions**: 219 | # MAGIC - The `archive_existing_versions=True` flag ensures that any pre-existing versions of the model in the 'Staging' stage are archived. This helps in keeping only the most relevant version in the stage. 220 | # MAGIC 221 | # MAGIC 3. **Error Handling**: 222 | # MAGIC - The operation is wrapped in a try-except block. If the transition operation fails for any reason, a detailed error message will be displayed to help diagnose the issue. 223 | # MAGIC 224 | # MAGIC > **Note**: Successful completion will print a message confirming the successful transition of the model version to the 'Staging' stage. 
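# Hedged follow-up: once the transition cell below has run, the resulting stage can be confirmed
# directly from the registry (MlflowClient.get_model_version is a standard client call):
#
#   confirmed = client.get_model_version(name=model_details.name, version=model_details.version)
#   print(f"Version {confirmed.version} is now in stage: {confirmed.current_stage}")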
225 | # MAGIC 226 | # MAGIC 227 | 228 | # COMMAND ---------- 229 | 230 | # Transition the model version to the 'Staging' stage in the model registry 231 | try: 232 | client.transition_model_version_stage( 233 | name=model_details.name, 234 | version=model_details.version, 235 | stage="Staging", 236 | archive_existing_versions=True # Archives any existing versions in the 'Staging' stage 237 | ) 238 | print(f"Successfully transitioned version {model_details.version} of the model '{model_details.name}' to 'Staging'.") 239 | except mlflow.exceptions.MlflowException as e: 240 | print(f"Failed to transition version {model_details.version} of the model '{model_details.name}' to 'Staging': {str(e)}") 241 | 242 | 243 | # COMMAND ---------- 244 | 245 | # MAGIC %md 246 | # MAGIC ### Model Registry Webhooks 247 | # MAGIC 248 | # MAGIC ### Supported Events 249 | # MAGIC * **MODEL_VERSION_CREATED**: A new model version was created for the associated model. 250 | # MAGIC * **MODEL_VERSION_TRANSITIONED_STAGE**: A model version’s stage was changed. 251 | # MAGIC * **TRANSITION_REQUEST_CREATED**: A user requested a model version’s stage be transitioned. 252 | # MAGIC * **COMMENT_CREATED**: A user wrote a comment on a registered model. 253 | # MAGIC * **REGISTERED_MODEL_CREATED**: A new registered model was created. This event type can only be specified for a registry-wide webhook, which can be created by not specifying a model name in the create request. 254 | # MAGIC * **MODEL_VERSION_TAG_SET**: A user set a tag on the model version. 255 | # MAGIC * **MODEL_VERSION_TRANSITIONED_TO_STAGING**: A model version was transitioned to staging. 256 | # MAGIC * **MODEL_VERSION_TRANSITIONED_TO_PRODUCTION**: A model version was transitioned to production. 257 | # MAGIC * **MODEL_VERSION_TRANSITIONED_TO_ARCHIVED**: A model version was archived. 258 | # MAGIC * **TRANSITION_REQUEST_TO_STAGING_CREATED**: A user requested a model version be transitioned to staging. 259 | # MAGIC * **TRANSITION_REQUEST_TO_PRODUCTION_CREATED**: A user requested a model version be transitioned to production. 260 | # MAGIC * **TRANSITION_REQUEST_TO_ARCHIVED_CREATED**: A user requested a model version be archived. 261 | # MAGIC 262 | # MAGIC ### Types of webhooks 263 | # MAGIC * **HTTP webhook** — send triggers to endpoints of your choosing such as slack, AWS Lambda, Azure Functions, or GCP Cloud Functions 264 | # MAGIC * **Job webhook** — trigger a job within the Databricks workspace 265 | 266 | # COMMAND ---------- 267 | 268 | # MAGIC %md 269 | # MAGIC ## MLflow Endpoint Utility Functions 270 | # MAGIC 271 | # MAGIC This script contains utility functions to interact with MLflow REST API endpoints. The code imports necessary modules, initializes an MLflow client, and defines a series of functions to handle REST API calls. Below are the key components: 272 | # MAGIC 273 | # MAGIC ### Import Statements 274 | # MAGIC 275 | # MAGIC - `http_request from mlflow.utils.rest_utils`: Required for making HTTP requests to the MLflow server. 276 | # MAGIC - `json`: Standard library for handling JSON formatted data. 277 | # MAGIC 278 | # MAGIC ### Functions 279 | # MAGIC 280 | # MAGIC #### `get_mlflow_client()` 281 | # MAGIC - **Purpose**: Returns an initialized MLflowClient object for further operations. 282 | # MAGIC - **Return Type**: `MlflowClient` 283 | # MAGIC 284 | # MAGIC #### `get_host_creds(client)` 285 | # MAGIC - **Parameters**: `client` - Initialized MlflowClient object. 
286 | # MAGIC - **Purpose**: Fetches the host and token credentials from the MLflow tracking server. 287 | # MAGIC - **Return Type**: Host and token credentials. 288 | # MAGIC 289 | # MAGIC #### `mlflow_call_endpoint(endpoint, method, body='{}')` 290 | # MAGIC - **Parameters**: 291 | # MAGIC - `endpoint` (str): The MLflow API endpoint to call. 292 | # MAGIC - `method` (str): HTTP method to use ('GET' or other HTTP methods). 293 | # MAGIC - `body` (str, optional): JSON-formatted request payload, default is an empty JSON object. 294 | # MAGIC - **Purpose**: Makes a REST API call to the specified MLflow endpoint. 295 | # MAGIC - **Return Type**: Dictionary containing the JSON response from the API call or `None` if the request fails. 296 | # MAGIC - **Error Handling**: Captures exceptions and prints an error message detailing the failure. 297 | # MAGIC 298 | # MAGIC ### Client Initialization and Credential Retrieval 299 | # MAGIC 300 | # MAGIC After defining the functions, the script initializes an `MlflowClient` object and fetches the host and token credentials. 301 | # MAGIC 302 | # MAGIC - `client = get_mlflow_client()`: Initializes the client. 303 | # MAGIC - `host_creds = get_host_creds(client)`: Retrieves host and token credentials. 304 | # MAGIC - `host = host_creds.host`: Extracts the host. 305 | # MAGIC - `token = host_creds.token`: Extracts the token. 306 | # MAGIC 307 | # MAGIC 308 | 309 | # COMMAND ---------- 310 | 311 | from mlflow.utils.rest_utils import http_request 312 | import json 313 | 314 | def get_mlflow_client(): 315 | """Returns an initialized MLflowClient object.""" 316 | return mlflow.tracking.client.MlflowClient() 317 | 318 | def get_host_creds(client): 319 | """Fetches host and token credentials.""" 320 | return client._tracking_client.store.get_host_creds() 321 | 322 | def mlflow_call_endpoint(endpoint, method, body='{}'): 323 | """Calls an MLflow REST API endpoint. 324 | 325 | Parameters: 326 | endpoint (str): The endpoint to call. 327 | method (str): HTTP method ('GET' or other HTTP methods). 328 | body (str): JSON-formatted request payload. 329 | 330 | Returns: 331 | dict: JSON response as a dictionary. 332 | """ 333 | host_creds = get_host_creds(get_mlflow_client()) 334 | 335 | try: 336 | if method == 'GET': 337 | response = http_request( 338 | host_creds=host_creds, 339 | endpoint=f"/api/2.0/mlflow/{endpoint}", 340 | method=method, 341 | params=json.loads(body) 342 | ) 343 | else: 344 | response = http_request( 345 | host_creds=host_creds, 346 | endpoint=f"/api/2.0/mlflow/{endpoint}", 347 | method=method, 348 | json=json.loads(body) 349 | ) 350 | 351 | return response.json() 352 | 353 | except Exception as e: 354 | print(f"Failed to call MLflow endpoint '{endpoint}': {str(e)}") 355 | return None 356 | 357 | 358 | client = get_mlflow_client() 359 | host_creds = get_host_creds(client) 360 | host = host_creds.host 361 | token = host_creds.token 362 | 363 | # COMMAND ---------- 364 | 365 | # MAGIC %md 366 | # MAGIC ### Setting Up Slack Notifications and Webhooks 367 | # MAGIC 368 | # MAGIC You can read more about Slack webhooks [here](https://api.slack.com/messaging/webhooks#create_a_webhook). 369 | # MAGIC 370 | # MAGIC First, we set up a webhook to notify us whenever a **New model version is created**. 371 | # MAGIC 372 | # MAGIC In the next cell assign the slack_webhook variable the link to your webhook. 
It should look as follows`"https://hooks.slack.com/services/?????????/??????????/????????????????????????"` 373 | 374 | # COMMAND ---------- 375 | 376 | slack_webhook = "https://hooks.slack.com/services/?????????/??????????/???????????????????????" 377 | 378 | # COMMAND ---------- 379 | 380 | import json 381 | 382 | trigger_for_slack = json.dumps({ 383 | "model_name": registry_model_name, 384 | "events": ["MODEL_VERSION_CREATED"], 385 | "description": "Triggered when a new model version is created.", 386 | "http_url_spec": { 387 | "url": slack_webhook 388 | } 389 | }) 390 | 391 | mlflow_call_endpoint("registry-webhooks/create", method = "POST", body = trigger_for_slack) 392 | 393 | # COMMAND ---------- 394 | 395 | # MAGIC %md 396 | # MAGIC Similarly we can create a webhook that notifies us when a **New transition request is made for a mode version**. 397 | 398 | # COMMAND ---------- 399 | 400 | trigger_for_slack = json.dumps({ 401 | "model_name": registry_model_name, 402 | "events": ["TRANSITION_REQUEST_CREATED"], 403 | "description": "Triggered when a new transition request for a model has been made.", 404 | "http_url_spec": { 405 | "url": slack_webhook 406 | } 407 | }) 408 | 409 | mlflow_call_endpoint("registry-webhooks/create", method = "POST", body = trigger_for_slack) 410 | 411 | # COMMAND ---------- 412 | 413 | # MAGIC %md 414 | # MAGIC ### Listing all webhooks. 415 | 416 | # COMMAND ---------- 417 | 418 | list_model_webhooks = json.dumps({"model_name": registry_model_name}) 419 | 420 | model_webhooks = mlflow_call_endpoint("registry-webhooks/list", method = "GET", body = list_model_webhooks) 421 | model_webhooks 422 | 423 | # COMMAND ---------- 424 | 425 | # MAGIC %md 426 | # MAGIC You can also **delete webhooks**. 427 | # MAGIC 428 | # MAGIC You can use the below cell to delete webhooks by ID or delete all the webhooks for a specific model. 429 | 430 | # COMMAND ---------- 431 | 432 | # for webhook in model_webhooks["webhooks"]: 433 | # mlflow_call_endpoint( 434 | # "registry-webhooks/delete", 435 | # method="DELETE", 436 | # body=json.dumps({'id': webhook["id"]}) 437 | # ) 438 | 439 | # COMMAND ---------- 440 | 441 | 442 | -------------------------------------------------------------------------------- /Chapter-09/model-drift.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | # MAGIC %md 3 | # MAGIC %md 4 | # MAGIC ## Author 5 | # MAGIC 6 | # MAGIC - **Debu Sinha** 7 | # MAGIC 8 | # MAGIC ## Tested Environment 9 | # MAGIC 10 | # MAGIC - **Databricks Runtime**: This notebook is tested on Databricks Runtime for Machine Learning 13.3 LTS or above. 11 | # MAGIC - **Cluster Configuration**: Single node cluster with at least 32GB RAM and 4 VCPU. 12 | # MAGIC - **Note**: The same cluster set up in Chapters 3 and 4 will be used here. 13 | # MAGIC 14 | # MAGIC ## Cluster Setup Instructions 15 | # MAGIC 16 | # MAGIC 1. **Create a Cluster**: 17 | # MAGIC - Navigate to the `Compute` icon on the left sidebar and click on `Create Cluster`. 18 | # MAGIC - Under `Policy`, select `Unrestricted`. 19 | # MAGIC - Enter a name for your cluster, for example, `demo`, into the cluster name text box. 20 | # MAGIC - In `Cluster Mode`, select `Single Node`. 21 | # MAGIC - Choose `Databricks Runtime Version` 13.3 LTS (Scala 2.12, Spark 3.4.1) from the `ML` tab. 22 | # MAGIC - On `AWS`, select `i3.xlarge` / on `Azure`, select `Standard_DS4_V2` as __Node type__. 
23 | # MAGIC - Click on `Create Cluster` and wait for your cluster to be provisioned.
24 | # MAGIC
25 | # MAGIC 2. **Attach this Notebook to Your Cluster**:
26 | # MAGIC - Click on the menu labeled `Detached` at the top left of this workbook.
27 | # MAGIC - Select your cluster name to attach this notebook to your cluster.
28 | # MAGIC --------
29 | # MAGIC ### Outline
30 | # MAGIC
31 | # MAGIC We simulate a batch inference scenario in which we train, deploy, and maintain a model that predicts sales for an e-commerce website on a monthly basis.
32 | # MAGIC
33 | # MAGIC **Data interval**: Arrives monthly
34 | # MAGIC **Date range**: 01/01/2023 - 03/31/2023 35 | # MAGIC 36 | # MAGIC **Workflow**: 37 | # MAGIC * Load the new month of incoming data 38 | # MAGIC * Apply incoming data checks 39 | # MAGIC * Error and drift evaluation 40 | # MAGIC * Identify and address any errors and drifts 41 | # MAGIC * Train a new model 42 | # MAGIC * Apply model validation checks versus the existing model in production 43 | # MAGIC * If checks pass, deploy the new candidate model to production 44 | # MAGIC * If checks fail, do not deploy the new candidate model
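# The workflow above maps onto the utility notebooks run in the setup section; a condensed,
# commented sketch of one month is shown here (variable names mirror those defined later in this
# notebook, and the metric key passed to compare_model_perfs is a placeholder):
#
#   new_pdf = spark.read.csv(raw_good_data_path, header=True, inferSchema=True).toPandas()
#   prod_run = get_run_from_registered_model(mlflow_experiment_name, stage="Production")
#   prod_pdf = load_delta_table_from_run(prod_run).toPandas()
#
#   # incoming data checks
#   check_null_proportion(new_pdf, null_proportion_threshold=0.5)
#   check_diff_in_summary_stats(calculate_summary_stats(new_pdf),
#                               load_summary_stats_pdf_from_run(prod_run, project_local_tmp_dir),
#                               num_cols + [target_col], stats_threshold_limit,
#                               ["mean", "median", "std", "min", "max"])
#   check_dist_ks_bonferroni_test(prod_pdf, new_pdf, num_cols + [target_col], p_threshold)
#   check_categorical_diffs(prod_pdf, new_pdf, cat_cols, p_threshold)
#
#   # retrain on the updated Gold table, then compare against Production before promoting
#   candidate_run = train_sklearn_rf_model(run_name, months_gold_path, model_params, misc_params)
#   compare_model_perfs(candidate_run, prod_run, min_model_r2_threshold, metric_to_check="<metric key>")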
45 | # MAGIC 46 | # MAGIC **Reproducibility Tools**: 47 | # MAGIC * [MLflow](https://www.mlflow.org/docs/latest/index.html) for model parameters, metrics, and artifacts 48 | # MAGIC * [Delta](https://docs.delta.io/latest/index.html) for data versioning
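# As a concrete example of the MLflow + Delta combination above: train_sklearn_rf_model logs the
# Delta path and version as run parameters, so the exact snapshot a Production model was trained
# on can be reloaded later. A short sketch using the helpers defined in ./util/training:
#
#   prod_run = get_run_from_registered_model(mlflow_experiment_name, stage="Production")
#   training_snapshot_df = load_delta_table_from_run(prod_run)  # reads with option("versionAsOf", ...)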
49 | # MAGIC 50 | # MAGIC Although this notebook specifically addresses tests to monitor a supervised ML model for batch inference, the same tests are applicable in streaming and real-time settings. 51 | 52 | # COMMAND ---------- 53 | 54 | # MAGIC %md 55 | # MAGIC 56 | # MAGIC ### Run setup and utils notebooks 57 | 58 | # COMMAND ---------- 59 | 60 | # MAGIC %run ./config/setup 61 | 62 | # COMMAND ---------- 63 | 64 | # MAGIC %run ./util/training 65 | 66 | # COMMAND ---------- 67 | 68 | # MAGIC %run ./data/datagen 69 | 70 | # COMMAND ---------- 71 | 72 | # MAGIC %run ./util/monitoring 73 | 74 | # COMMAND ---------- 75 | 76 | # Remove all existing widgets 77 | dbutils.widgets.removeAll() 78 | 79 | # Create three widgets for the stats threshold limit, p-threshold, and min model R2 threshold 80 | dbutils.widgets.text("stats_threshold_limit", "0.5") 81 | dbutils.widgets.text("p_threshold", "0.05") 82 | dbutils.widgets.text("min_model_r2_threshold", "0.005") 83 | 84 | # Get the values of the widgets 85 | # stats_threshold_limit: how much we should allow basic summary stats to shift 86 | stats_threshold_limit = float(dbutils.widgets.get("stats_threshold_limit")) 87 | 88 | # p_threshold: the p-value below which to reject null hypothesis 89 | p_threshold = float(dbutils.widgets.get("p_threshold")) 90 | 91 | # min_model_r2_threshold: minimum model improvement 92 | min_model_r2_threshold = float(dbutils.widgets.get("min_model_r2_threshold")) 93 | 94 | # COMMAND ---------- 95 | 96 | # MAGIC %md 97 | # MAGIC # Model Drift Dummy Dataset 98 | 99 | # COMMAND ---------- 100 | 101 | # MAGIC %md 102 | # MAGIC 103 | # MAGIC 104 | # MAGIC ### Month 1 - Base line Data 105 | # MAGIC 106 | # MAGIC We have generated a dummy dataset to showcase model drift. The dataset consists of time series data for three months. The independent features of the dataset are: 107 | # MAGIC 108 | # MAGIC | Feature | Type | Description | 109 | # MAGIC |---|---|---| 110 | # MAGIC | Date | date | The date for which the record belongs. | 111 | # MAGIC | Temperature | numeric | The highest daily temperature in Fahrenheit. | 112 | # MAGIC | Weather_Condition | categorical | The weather condition, which can be sunny, cloudy, or rainy. | 113 | # MAGIC | Promotion_Type | categorical | The type of promotion, which can be a discount, free gift, or bundle deal. | 114 | # MAGIC | Website_Traffic | numeric | The total website traffic. | 115 | # MAGIC | Device_Type | categorical | The type of device used to access the website, which can be mobile, desktop, or tablet. | 116 | # MAGIC 117 | # MAGIC The target variable of the dataset is Daily_Sales (numeric). Daily_Sales has the following correlations with the independent features for the first month: 118 | # MAGIC 119 | # MAGIC * Positive correlation with Temperature and Website_Traffic. 120 | # MAGIC * Negative correlation with Weather_Condition and Device_Type. 121 | # MAGIC 122 | # MAGIC ### Data and Model Management 123 | # MAGIC 124 | # MAGIC #### Variables 125 | # MAGIC 126 | # MAGIC The following variables are also defined during our setup to help with execution down the line: 127 | # MAGIC 128 | # MAGIC Variable | Description 129 | # MAGIC ---|--- 130 | # MAGIC `project_home_dir` | The path to the project home directory. 131 | # MAGIC `raw_good_data_path` | The path to the directory where the raw data is stored as csv. 132 | # MAGIC `raw_month2_bad_data_path` | The path to the directory where the bad data for simulating feature drift is stored as csv. 
133 | # MAGIC `months_gold_path` | The path to the directory where the clean and processed data is stored in Delta format. 134 | # MAGIC `mlflow_experiment_name` | The name of the MLflow experiment where the model will be registered. 135 | # MAGIC `mlflow_experiment_path` | The path relative to our home directory in the workspace where the experiment will be located. 136 | # MAGIC 137 | # MAGIC 138 | # MAGIC 139 | # MAGIC 140 | 141 | # COMMAND ---------- 142 | 143 | print(f'good raw data file location : {raw_good_data_path}') 144 | print(f'bad raw data location : {raw_month2_bad_data_path}') 145 | print(f'Gold Delta table path : {months_gold_path}') 146 | print(f'MLflow experiment name : {mlflow_experiment_name}') 147 | print(f'MLflow experiment path : {mlflow_experiment_path}') 148 | 149 | # COMMAND ---------- 150 | 151 | # MAGIC %md 152 | # MAGIC #### i. Initial Data load 153 | # MAGIC 154 | # MAGIC Load the first month of data which we use to train and evaluate our first model. 155 | # MAGIC 156 | # MAGIC We create a "Gold" table to which we will be appending each subsequent month of data. 157 | # MAGIC 158 | 159 | # COMMAND ---------- 160 | 161 | # Ensure we start with no existing Delta table 162 | dbutils.fs.rm(months_gold_path, True) 163 | 164 | # Incoming Month 1 Data 165 | raw_data = spark.read.csv(raw_good_data_path, header=True, inferSchema=True) 166 | 167 | # Filter the DataFrame to only include data for January 2023 168 | raw_data_month1 = raw_data.filter(raw_data["Date"].between("2023-01-01", "2023-01-31")) 169 | 170 | # Print the filtered DataFrame 171 | raw_data_month1.show() 172 | 173 | # COMMAND ---------- 174 | 175 | import pyspark.sql.functions as F 176 | # Create inital version of the Gold Delta table we will use for training - this will be updated with subsequent "months" of data 177 | raw_data_month1.withColumn("month", F.lit("month_1")).write.format("delta").mode("overwrite").partitionBy("month").save(months_gold_path) 178 | 179 | # COMMAND ---------- 180 | 181 | #list files in the gold delta table path 182 | display(dbutils.fs.ls(months_gold_path)) 183 | 184 | # COMMAND ---------- 185 | 186 | # MAGIC %md 187 | # MAGIC #### ii. 
Model Training
188 | 
189 | # COMMAND ----------
190 | 
191 | #read gold data for month 1 from the Delta table
192 | month1_gold_delta_table = DeltaTable.forPath(spark, path=months_gold_path)
193 | month1_gold_df = month1_gold_delta_table.toDF()
194 | 
195 | # Set the month number - used for naming the MLflow run and tracked as a parameter
196 | month = 1
197 | 
198 | # Specify name of MLflow run
199 | run_name = f"month_{month}"
200 | 
201 | target_col = "Daily_Sales"
202 | cat_cols = [col[0] for col in month1_gold_df.dtypes if col[1]=="string" and col[0]!='month']
203 | num_cols= [col[0] for col in month1_gold_df.dtypes if ((col[1]=="int" or col[1]=="double") and col[0]!="Daily_Sales") ]
204 | 
205 | print(f"category columns : {cat_cols}")
206 | print(f"numeric columns : {num_cols}")
207 | print(f"target column : {target_col}")
208 | 
209 | # Define the parameters to pass in the RandomForestRegressor model
210 | model_params = {"n_estimators": 500,
211 |                 "max_depth": 5,
212 |                 "max_features": "log2"}
213 | 
214 | # Define a dictionary of parameters that we would like to use during preprocessing
215 | misc_params = {"month": month,
216 |                "target_col": target_col,
217 |                "cat_cols": cat_cols,
218 |                "num_cols": num_cols}
219 | 
220 | # COMMAND ----------
221 | 
222 | # Trigger model training and logging to MLflow
223 | month1_run = train_sklearn_rf_model(run_name,
224 |                                     months_gold_path,
225 |                                     model_params,
226 |                                     misc_params)
227 | 
228 | 
229 | month_1_run_id = month1_run.info.run_id
230 | 
231 | # COMMAND ----------
232 | 
233 | # MAGIC %md
234 | # MAGIC
235 | # MAGIC #### iii. Model Deployment
236 | # MAGIC We first register the model to the MLflow Model Registry. For demonstration purposes, we will immediately transition the model to the "Production" stage in the MLflow Model Registry. However, in a real-world scenario, one should have a robust model validation process in place prior to migrating a model to Production.
237 | # MAGIC
238 | # MAGIC We will demonstrate a multi-stage approach in the subsequent sections:
239 | # MAGIC 1. Transitioning the model to the "Staging" stage.
240 | # MAGIC 2. Conducting model validation checks.
241 | # MAGIC 3. Only then, triggering a transition from Staging to Production once these checks are satisfied.
242 | # MAGIC
243 | # MAGIC
244 | 
245 | # COMMAND ----------
246 | 
247 | # Register model to MLflow Model Registry
248 | month_1_model_version = mlflow.register_model(model_uri=f"runs:/{month_1_run_id}/model", name=mlflow_experiment_name)
249 | 
250 | # COMMAND ----------
251 | 
252 | # Transition model to Production
253 | month_1_model_version = transition_model(month_1_model_version, stage="Production")
254 | print(month_1_model_version)
255 | 
256 | # COMMAND ----------
257 | 
258 | # MAGIC %md
259 | # MAGIC
260 | # MAGIC ### Month 2 - Arrival of New Data
261 | # MAGIC
262 | # MAGIC After deploying our model for a month, we are now faced with the arrival of a fresh month's worth of data. Let's explore two scenarios related to this new data:
263 | # MAGIC
264 | # MAGIC **Scenario 1: Missing values in website_traffic**
265 | # MAGIC A bug in an updated upstream data cleaning process causes the `website_traffic` counts for the promotion types `bundle_deal` and `free_gift` to be empty.
266 | # MAGIC
267 | # MAGIC **Scenario 2: Introduction of new measurement for temperature**
268 | # MAGIC Also, during the upstream data generation procedure, the temperature values are now being captured in __Celsius__ rather than in __Fahrenheit__.
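# Hedged illustration only -- the actual bad data is produced by ./data/datagen; this just shows
# how the two scenarios above could be simulated on a good month of data (column names follow the
# feature table earlier in this notebook; category value spellings follow the scenario text).
import pyspark.sql.functions as F

illustrative_bad_month = (
    raw_data
    .withColumn("Temperature", (F.col("Temperature") - 32) * 5.0 / 9.0)  # Fahrenheit -> Celsius
    .withColumn(
        "Website_Traffic",
        F.when(F.col("Promotion_Type").isin("bundle_deal", "free_gift"), F.lit(None))
         .otherwise(F.col("Website_Traffic")),
    )  # traffic counts blanked for the affected promotion types
)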

# MAGIC %md
# MAGIC
# MAGIC #### iii. Model Deployment
# MAGIC We first register the model in the MLflow Model Registry. For demonstration purposes, we will immediately transition the model to the "Production" stage. However, in a real-world scenario, one should have a robust model validation process in place prior to migrating a model to Production.
# MAGIC
# MAGIC We will demonstrate a multi-stage approach in the subsequent sections:
# MAGIC 1. Transitioning the model to the "Staging" stage.
# MAGIC 2. Conducting model validation checks.
# MAGIC 3. Only then, triggering a transition from Staging to Production once these checks are satisfied.

# COMMAND ----------

# Register model to MLflow Model Registry
month_1_model_version = mlflow.register_model(model_uri=f"runs:/{month_1_run_id}/model", name=mlflow_experiment_name)

# COMMAND ----------

# Transition model to Production
month_1_model_version = transition_model(month_1_model_version, stage="Production")
print(month_1_model_version)

# COMMAND ----------
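
# MAGIC %md
# MAGIC `transition_model` is another helper defined outside this notebook. Below is a minimal sketch of such a wrapper, assuming the stage-based Model Registry API (`MlflowClient.transition_model_version_stage`); the real helper may differ, for example in whether it archives existing versions or updates the version description.

# COMMAND ----------

# Illustrative sketch only: not the actual transition_model implementation (and not invoked here)
from mlflow.tracking import MlflowClient


def transition_model_sketch(model_version, stage):
    client = MlflowClient()
    client.transition_model_version_stage(
        name=model_version.name,
        version=model_version.version,
        stage=stage,
        # Assumption: archive any older versions already in the target stage
        archive_existing_versions=(stage == "Production"),
    )
    # Return the refreshed ModelVersion so callers can inspect the new stage
    return client.get_model_version(name=model_version.name, version=model_version.version)

# COMMAND ----------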

# MAGIC %md
# MAGIC
# MAGIC ### Month 2 - Arrival of New Data
# MAGIC
# MAGIC After deploying our model for a month, we are now faced with the arrival of a fresh month's worth of data. Let's explore two scenarios related to this new data:
# MAGIC
# MAGIC **Scenario 1: Missing values in website_traffic**
# MAGIC An updated upstream data cleaning process has a bug that causes the `website_traffic` counts for the promotion types `bundle_deal` and `free_gift` to be empty.
# MAGIC
# MAGIC **Scenario 2: Introduction of a new unit of measurement for temperature**
# MAGIC During the upstream data generation procedure, temperature values are now being captured in __Celsius__ rather than in __Fahrenheit__.
# MAGIC
# MAGIC **What are we simulating here?**
# MAGIC In this scenario, we are simulating two important factors:
# MAGIC - Feature drift: The characteristics of the data have changed over time, specifically with missing `website_traffic` entries for `bundle_deal` and `free_gift`.
# MAGIC - Upstream data errors: Unexpected changes or additions in the data generation process, such as the introduction of a different unit of measurement for temperature.

# COMMAND ----------

# MAGIC %md
# MAGIC #### i. Feature checks prior to model training
# MAGIC
# MAGIC **All features**
# MAGIC * Null checks
# MAGIC
# MAGIC **Numeric features**
# MAGIC * Summary statistic checks: mean, median, standard deviation, minimum, maximum
# MAGIC * Distribution checks
# MAGIC
# MAGIC **Categorical features**
# MAGIC * Check expected count for each level
# MAGIC * Check the mode

# COMMAND ----------

# Incoming Month 2 Data
raw_data_month2 = spark.read.csv(raw_month2_bad_data_path, header=True, inferSchema=True)

# Filter the DataFrame to only include data for Feb 2023
raw_data_month2 = raw_data_month2.filter(raw_data_month2["Date"].between("2023-02-01", "2023-02-28"))

# Print the filtered DataFrame
raw_data_month2.show(5)

# COMMAND ----------

# Compute summary statistics on the new incoming data.
# We keep only the columns that we monitored for the last model training data.
# Converting to a pandas DataFrame should be done with care: if the data is larger than what fits on the
# driver node, this can cause failures. For large datasets, use a proper sampling technique to estimate
# population summary statistics (a sketch of this approach follows below).
month_2_pdf = raw_data_month2.toPandas().drop(['Date'], axis=1)
summary_stats_month_2_pdf = calculate_summary_stats(month_2_pdf)
summary_stats_month_2_pdf

# COMMAND ----------
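
# MAGIC %md
# MAGIC The comments above caution against collecting large datasets to the driver. The cell below is an illustrative sketch of sampling in Spark before converting to pandas, together with a rough idea of what a `calculate_summary_stats`-style helper could compute; the sampling fraction is an arbitrary example and the real helper may track different statistics.

# COMMAND ----------

# Illustrative sketch only
# (1) For large tables, sample in Spark *before* collecting to the driver
sampled_month_2_pdf = (
    raw_data_month2
    .sample(withReplacement=False, fraction=0.1, seed=42)  # fraction chosen for illustration; tune to your data volume
    .toPandas()
    .drop(["Date"], axis=1)
)

# (2) A calculate_summary_stats-style helper might simply describe the numeric columns
def calculate_summary_stats_sketch(pdf):
    # pandas describe() returns count, mean, std, min, quartiles and max;
    # add the median explicitly since it is one of the statistics checked later
    stats_pdf = pdf.describe().transpose()
    stats_pdf["median"] = pdf.median(numeric_only=True)
    return stats_pdf

calculate_summary_stats_sketch(sampled_month_2_pdf)

# COMMAND ----------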

# Get the original MLflow run associated with the model registered under Production
current_prod_run = get_run_from_registered_model(mlflow_experiment_name, stage="Production")

# Load in the original version of the Delta table used at training time for the current Production model
current_prod_pdf = load_delta_table_from_run(current_prod_run).toPandas()

# Load the summary statistics pandas DataFrame for the data on which the model currently in Production was trained and evaluated
current_prod_stats_pdf = load_summary_stats_pdf_from_run(current_prod_run, project_local_tmp_dir)
current_prod_stats_pdf

# COMMAND ----------

# MAGIC %md
# MAGIC
# MAGIC **All features**
# MAGIC * Null checks

# COMMAND ----------

print("\nCHECKING PROPORTION OF NULLS.....")
check_null_proportion(month_2_pdf, null_proportion_threshold=.5)

# COMMAND ----------

# MAGIC %md
# MAGIC **Numeric features**
# MAGIC * Summary statistic checks: mean, median, standard deviation, minimum, maximum
# MAGIC * Distribution checks

# COMMAND ----------

statistic_list = ["mean", "median", "std", "min", "max"]

unique_feature_diff_array_month_2 = check_diff_in_summary_stats(summary_stats_month_2_pdf,
                                                                current_prod_stats_pdf,
                                                                num_cols + [target_col],
                                                                stats_threshold_limit,
                                                                statistic_list)

unique_feature_diff_array_month_2

# COMMAND ----------

print(f"Let's look at the box plots of the features that exceed the stats_threshold_limit of {stats_threshold_limit}")
plot_boxplots(unique_feature_diff_array_month_2, current_prod_pdf, month_2_pdf)

# COMMAND ----------

print("\nCHECKING VARIANCES WITH LEVENE TEST.....")
check_diff_in_variances(current_prod_pdf, month_2_pdf, num_cols, p_threshold)

print("\nCHECKING KS TEST.....")
check_dist_ks_bonferroni_test(current_prod_pdf, month_2_pdf, num_cols + [target_col], p_threshold)

# COMMAND ----------
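
# MAGIC %md
# MAGIC Checks like these are typically built on standard `scipy.stats` tests. Below is a minimal sketch, assuming Levene's test for equality of variances and a two-sample Kolmogorov-Smirnov test with a Bonferroni-corrected significance level; the real `check_diff_in_variances` and `check_dist_ks_bonferroni_test` helpers may handle nulls and report results differently.

# COMMAND ----------

# Illustrative sketch only: not the actual check_* implementations
from scipy import stats


def check_diff_in_variances_sketch(reference_pdf, new_pdf, num_cols, p_threshold):
    for col in num_cols:
        _, p_value = stats.levene(reference_pdf[col].dropna(), new_pdf[col].dropna(), center="median")
        if p_value <= p_threshold:
            print(f"{col}: variances differ (Levene p-value = {p_value:.4f})")


def check_dist_ks_bonferroni_sketch(reference_pdf, new_pdf, cols, p_threshold):
    # Bonferroni correction: divide the significance level by the number of comparisons
    corrected_threshold = p_threshold / len(cols)
    for col in cols:
        _, p_value = stats.ks_2samp(reference_pdf[col].dropna(), new_pdf[col].dropna())
        if p_value <= corrected_threshold:
            print(f"{col}: distributions differ (KS p-value = {p_value:.4f}, corrected threshold = {corrected_threshold:.4f})")


check_diff_in_variances_sketch(current_prod_pdf, month_2_pdf, num_cols, p_threshold)
check_dist_ks_bonferroni_sketch(current_prod_pdf, month_2_pdf, num_cols + [target_col], p_threshold)

# COMMAND ----------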

# MAGIC %md
# MAGIC
# MAGIC **Categorical features**
# MAGIC * Check expected count for each level
# MAGIC * Check the mode

# COMMAND ----------

check_categorical_diffs(current_prod_pdf, month_2_pdf, cat_cols, p_threshold)

# COMMAND ----------

# MAGIC %md
# MAGIC
# MAGIC **`Action`: Resolve data issues**
# MAGIC
# MAGIC After identifying the data issues with `Temperature` and `Website_Traffic` and collaborating with the upstream data processing team, we have successfully resolved these issues. The fixed data for the new month is incorporated into our Gold Delta table, and we proceed with training on the updated dataset to leverage the newly available information.

# COMMAND ----------

# Incoming corrected data
raw_data = spark.read.csv(raw_good_data_path, header=True, inferSchema=True)

# Filter the DataFrame to only include data for February 2023
raw_data_month2 = raw_data.filter(raw_data["Date"].between("2023-02-01", "2023-02-28"))

# Append the new month of data to the Gold Delta table to use for training
raw_data_month2.withColumn("month", F.lit("month_2")).write.format("delta").partitionBy("month").mode("append").save(months_gold_path)

# COMMAND ----------

# MAGIC %md
# MAGIC #### ii. Model Training
# MAGIC
# MAGIC Retrain the same model, but this time we are able to use an extra month of data.

# COMMAND ----------

# Set the month number - used for naming the MLflow run and tracked as a parameter
month = 2

# Specify name of MLflow run
run_name = f"month_{month}"

# Define the parameters to pass to the RandomForestRegressor model
model_params = {"n_estimators": 500,
                "max_depth": 5,
                "max_features": "log2"}

# Define a dictionary of parameters that we would like to use during preprocessing
misc_params = {"month": month,
               "target_col": target_col,
               "cat_cols": cat_cols,
               "num_cols": num_cols}

# COMMAND ----------

# Trigger model training and logging to MLflow
month2_run = train_sklearn_rf_model(run_name,
                                    months_gold_path,
                                    model_params,
                                    misc_params)

month_2_run_id = month2_run.info.run_id

# COMMAND ----------

# Register model to MLflow Model Registry
month_2_model_version = mlflow.register_model(model_uri=f"runs:/{month_2_run_id}/model", name=mlflow_experiment_name)

# Transition model to Staging
month_2_model_version = transition_model(month_2_model_version, stage="Staging")
print(month_2_model_version)

# COMMAND ----------

# MAGIC %md
# MAGIC
# MAGIC #### iii. Model checks prior to model deployment

# COMMAND ----------

# Get the MLflow run associated with the model currently registered under Staging
current_staging_run = get_run_from_registered_model(mlflow_experiment_name, stage="Staging")

metric_to_check = "r2_score_X_test"
compare_model_perfs(current_staging_run, current_prod_run, min_model_r2_threshold, metric_to_check)

# COMMAND ----------
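
# MAGIC %md
# MAGIC `compare_model_perfs` compares the candidate (Staging) run against the current Production run on the chosen metric. The cell below is a minimal, illustrative sketch of such a comparison, assuming both runs logged the metric under the same name and that the candidate must also clear the absolute `min_model_r2_threshold` floor; the real helper may format or decide things differently.

# COMMAND ----------

# Illustrative sketch only: not the actual compare_model_perfs implementation
def compare_model_perfs_sketch(staging_run, prod_run, min_r2_threshold, metric):
    staging_metric = staging_run.data.metrics[metric]
    prod_metric = prod_run.data.metrics[metric]

    print(f"Staging    {metric}: {staging_metric:.4f}")
    print(f"Production {metric}: {prod_metric:.4f}")

    if staging_metric < min_r2_threshold:
        print(f"Candidate model is below the minimum threshold of {min_r2_threshold}; do not promote.")
    elif staging_metric < prod_metric:
        print("Candidate model underperforms the current Production model; investigate before promoting.")
    else:
        print("Candidate model matches or beats the current Production model.")


compare_model_perfs_sketch(current_staging_run, current_prod_run, min_model_r2_threshold, metric_to_check)

# COMMAND ----------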

month_2_model_version = transition_model(month_2_model_version, stage="Production")
print(month_2_model_version)

# COMMAND ----------

# MAGIC %md
# MAGIC
# MAGIC ### Month 3 - New Data Arrives
# MAGIC
# MAGIC We have had a model in production for 2 months and have now obtained an additional month of data.
# MAGIC
# MAGIC **Scenario 3: Viral marketing campaign**
# MAGIC * A product campaign went viral on social media, and daily sales increased by 30%.
# MAGIC
# MAGIC **What are we simulating here?**
# MAGIC * Label drift
# MAGIC * Concept drift
# MAGIC * The underlying relationship between the features and the label has changed due to a viral marketing campaign.

# COMMAND ----------

# MAGIC %md
# MAGIC #### i. Feature checks prior to model training
# MAGIC
# MAGIC **All features**
# MAGIC * Null checks
# MAGIC
# MAGIC **Numeric features**
# MAGIC * Summary statistic checks: mean, median, standard deviation, minimum, maximum
# MAGIC * Distribution checks
# MAGIC
# MAGIC **Categorical features**
# MAGIC * Check expected count for each level
# MAGIC * Check the mode

# COMMAND ----------

# Incoming Month 3 Data
raw_data = spark.read.csv(raw_good_data_path, header=True, inferSchema=True)

# Filter the DataFrame to only include data for March 2023
raw_data_month3 = raw_data.filter(raw_data["Date"].between("2023-03-01", "2023-03-31"))

# Print the filtered DataFrame
raw_data_month3.show(5)

# COMMAND ----------

# Compute summary statistics on the new incoming data.
# We keep only the columns that we monitored for the last model training data.
# As before, converting to a pandas DataFrame should be done with care: if the data is larger than what
# fits on the driver node, this can cause failures. For large datasets, use a proper sampling technique
# to estimate population summary statistics.
month_3_pdf = raw_data_month3.toPandas().drop(['Date'], axis=1)
summary_stats_month_3_pdf = calculate_summary_stats(month_3_pdf)
summary_stats_month_3_pdf

# COMMAND ----------

# Get the current MLflow run associated with the model registered under Production
current_prod_run_2 = get_run_from_registered_model(mlflow_experiment_name, stage="Production")

# Load in the original version of the Delta table used at training time for the current Production model
current_prod_pdf_2 = load_delta_table_from_run(current_prod_run_2).toPandas()

# Load the summary statistics pandas DataFrame for the data on which the model currently in Production was trained and evaluated
current_prod_stats_pdf_2 = load_summary_stats_pdf_from_run(current_prod_run_2, project_local_tmp_dir)

# COMMAND ----------

# MAGIC %md
# MAGIC
# MAGIC **All features**
# MAGIC * Null checks

# COMMAND ----------

print("\nCHECKING PROPORTION OF NULLS.....")
check_null_proportion(month_3_pdf, null_proportion_threshold=.5)

# COMMAND ----------

# MAGIC %md
# MAGIC **Numeric features**
# MAGIC * Summary statistic checks: mean, median, standard deviation, minimum, maximum
# MAGIC * Distribution checks

# COMMAND ----------

unique_feature_diff_array_month_3 = check_diff_in_summary_stats(summary_stats_month_3_pdf,
                                                                current_prod_stats_pdf_2,
                                                                num_cols + [target_col],
                                                                stats_threshold_limit,
                                                                statistic_list)

unique_feature_diff_array_month_3

# COMMAND ----------

print(f"Let's look at the box plots of the features that exceed the stats_threshold_limit of {stats_threshold_limit}")
plot_boxplots(unique_feature_diff_array_month_3, current_prod_pdf_2, month_3_pdf)

# COMMAND ----------

print("\nCHECKING VARIANCES WITH LEVENE TEST.....")
check_diff_in_variances(current_prod_pdf_2, month_3_pdf, num_cols, p_threshold)

print("\nCHECKING KS TEST.....")
check_dist_ks_bonferroni_test(current_prod_pdf_2, month_3_pdf, num_cols + [target_col], p_threshold)

# COMMAND ----------

# MAGIC %md
# MAGIC
# MAGIC **Categorical features**
# MAGIC * Check expected count for each level
# MAGIC * Check the mode

# COMMAND ----------

check_categorical_diffs(current_prod_pdf_2, month_3_pdf, cat_cols, p_threshold)

# COMMAND ----------

# MAGIC %md
# MAGIC
# MAGIC **`Action`: Include new data with label drift in training**
# MAGIC
# MAGIC We observe that our label has drifted; after analysis, we find that this most recent month of data was captured during a spike in sales caused by a viral marketing campaign. As such, we will retrain our model and include this recent month of data during training.

# COMMAND ----------

# Append the new month of data (where daily sales are elevated across the board)
raw_data_month3.withColumn("month", F.lit("month_3")).write.format("delta").partitionBy("month").mode("append").save(months_gold_path)

# COMMAND ----------

# MAGIC %md
# MAGIC #### ii. Model Training
# MAGIC
# MAGIC Retrain the same model from previous months, including the additional month of data where the label has drifted.

# COMMAND ----------

# Set the month number - used for naming the MLflow run and tracked as a parameter
month = 3

# Specify name of MLflow run
run_name = f"month_{month}"

# Define the parameters to pass to the RandomForestRegressor model
model_params = {"n_estimators": 500,
                "max_depth": 5,
                "max_features": "log2"}

# Define a dictionary of parameters that we would like to use during preprocessing
misc_params = {"month": month,
               "target_col": target_col,
               "cat_cols": cat_cols,
               "num_cols": num_cols}

# COMMAND ----------

# Trigger model training and logging to MLflow
month3_run = train_sklearn_rf_model(run_name,
                                    months_gold_path,
                                    model_params,
                                    misc_params)

month_3_run_id = month3_run.info.run_id

# COMMAND ----------

# Register model to MLflow Model Registry
month_3_model_version = mlflow.register_model(model_uri=f"runs:/{month_3_run_id}/model", name=mlflow_experiment_name)

# Transition model to Staging
month_3_model_version = transition_model(month_3_model_version, stage="Staging")
print(month_3_model_version)

# COMMAND ----------

# MAGIC %md
# MAGIC
# MAGIC #### iii. Model checks prior to model deployment

# COMMAND ----------

# Get the MLflow run associated with the model currently registered in Staging
current_staging_run_2 = get_run_from_registered_model(mlflow_experiment_name, stage="Staging")

metric_to_check = "r2_score_X_test"
compare_model_perfs(current_staging_run_2, current_prod_run_2, min_model_r2_threshold, metric_to_check)

# COMMAND ----------

# MAGIC %md
# MAGIC
# MAGIC In this case we note that the new candidate model in Staging performs notably worse than the current model in Production. We know from our checks prior to training that the label has drifted, and that this was due to a spike in sales caused by a viral marketing campaign. At this point we would want to prevent a migration of the new candidate model directly to Production and instead investigate whether there is any way we can improve model performance. This could involve tuning the hyperparameters of our model, or investigating the inclusion of additional features such as "month of the year", which could allow us to capture temporal effects on sales (a quick sketch of deriving such a feature from the `Date` column follows below).
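
# COMMAND ----------

# MAGIC %md
# MAGIC The cell below is an optional, illustrative sketch of deriving such a "month of the year" feature from the `Date` column with PySpark functions. To actually use it, the new column would also need to be added to the feature lists and preprocessing before retraining.

# COMMAND ----------

# Illustrative sketch only: derive a calendar feature so the model can pick up seasonal effects on sales
gold_with_month_df = (
    spark.read.format("delta").load(months_gold_path)
    .withColumn("month_of_year", F.month(F.col("Date")))
)

display(gold_with_month_df.select("Date", "month_of_year", "Daily_Sales").limit(5))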

--------------------------------------------------------------------------------