├── README.md └── parallel_models_udf.py /README.md: -------------------------------------------------------------------------------- 1 | # Training many machine learning models in parallel using Databricks and PandasUDFs 2 | 3 | The Databricks Notebook within this repository provides a detailed, step-by-step example of training multiple machine learning models in parallel on different datasets. It includes the following steps. 4 | 5 | - Configuring the Databricks Cluster 6 | - Leveraging PandasUDFs to train machine learning models in parallel on different groups of a dataset. 7 | - Tuning model parameters using Hyperopt 8 | - Logging multiple models to a single MLflow Experiment Run 9 | - Applying multiple models for inference to different groups of data in parallel 10 | 11 | 12 | This repository can be cloned into a Databricks Repo; the code is self contained and can be run in any Databricks environment. The most recent testing of this notebook leveraged the Databricks ML Runtime version 10.5. -------------------------------------------------------------------------------- /parallel_models_udf.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | # MAGIC %md #Training machine learning models in parallel using PandasUDFs on Databricks 3 | # MAGIC 4 | # MAGIC Data Scientists often need to fit models to different groups of data. A data scientist in real estate industry may find it more effective to create separate models per geographic area due to regional difference that impact model performance. 5 | # MAGIC 6 | # MAGIC PandasUDFs on Databricks provide a mechanism for fitting machine leaning models on different groups of data in parallel. Models can be tuned using Hyperopt, an optimization framework built into the Machine Learning Runtime. Groups of fitted models can be saved to an MLflow Tracking Server instance and promoted to the Model Registry for inference. 7 | 8 | # COMMAND ---------- 9 | 10 | from collections import OrderedDict 11 | import datetime 12 | from pickle import dump 13 | from typing import List, Callable 14 | import csv 15 | 16 | from xgboost import XGBClassifier 17 | import pandas as pd 18 | import numpy as np 19 | 20 | from sklearn.preprocessing import OneHotEncoder, QuantileTransformer 21 | from sklearn.model_selection import train_test_split 22 | from sklearn.impute import SimpleImputer 23 | from sklearn.compose import ColumnTransformer 24 | from sklearn.pipeline import Pipeline 25 | from sklearn.metrics import precision_recall_fscore_support, roc_auc_score 26 | from sklearn.datasets import make_classification 27 | 28 | from pyspark import TaskContext 29 | from pyspark.sql.functions import col 30 | import pyspark.sql.functions as func 31 | from pyspark.sql.types import StructType, StructField, StringType, FloatType, IntegerType, ArrayType, MapType 32 | 33 | from hyperopt import fmin, tpe, hp, Trials, STATUS_OK, SparkTrials 34 | from hyperopt.early_stop import no_progress_loss 35 | 36 | import mlflow 37 | from mlflow.tracking import MlflowClient 38 | 39 | mlflow.autolog(disable=True) 40 | 41 | # COMMAND ---------- 42 | 43 | # MAGIC %md ## Environment setup 44 | # MAGIC We want to fit each model in parrallel using separate Spark tasks. When working with smaller groups of data, Adaptive Query Execution (AQE) can combine these smaller model fitting tasks into a single, larger task where models are fit sequentially. Since we want to avoid this behavior in this example, we will disable Adaptive Query Execution. Generally, AQE should be left enabled. 45 | 46 | # COMMAND ---------- 47 | 48 | spark.conf.set('spark.sql.adaptive.enabled', 'false') 49 | 50 | # COMMAND ---------- 51 | 52 | # MAGIC %md Also, since we are using Python libraries that can benefit from multiple cores, we can instruct Spark to provide more than one CPU core per tasks by setting **spark.task.cpus** in the Advanced options of the Clusters UI. In the Spark config section under the Spark tab, we set **spark.task.cpus 8**. In our example, we will fit 10 models in parrallel, so we need 80 cores in total to fit all models at the same time. We have also chosen compute optimized instances due to the computational intensity of our UDFs. 53 | 54 | # COMMAND ---------- 55 | 56 | # MAGIC %md ## Generate sample data 57 | # MAGIC We can use a PandasUDF to create synthetic, binary classification training datasets for each group. We'll create a Spark DataFrame containing 10 groups, then use a PandasUDF to generate data for each group. 58 | 59 | # COMMAND ---------- 60 | 61 | groups = [[f'group_{str(n+1).zfill(2)}'] for n in range(10)] 62 | 63 | schema = StructType() 64 | schema.add('group_name', StringType()) 65 | 66 | df = spark.createDataFrame(groups, schema=schema) 67 | display(df) 68 | 69 | # COMMAND ---------- 70 | 71 | def create_group_data(group_data: pd.DataFrame) -> pd.DataFrame: 72 | """ 73 | Generate a synthetic classification dataset 74 | """ 75 | 76 | n_samples = 10000 77 | n_features = 20 78 | 79 | X, y = make_classification(n_samples= n_samples, 80 | n_features= n_features, 81 | n_informative= 10, 82 | n_redundant= 0, 83 | n_classes= 2, 84 | flip_y= 0.4, 85 | random_state= np.random.randint(1,999)) 86 | 87 | numeric_feature_names = [f'numeric_feature_{str(n+1).zfill(2)}' for n in range(n_features)] 88 | categorical_feature_names = [] 89 | 90 | df = pd.DataFrame(X, columns=numeric_feature_names) 91 | 92 | num_categorical_features = 1 93 | 94 | # Convert numeric column to categorical based on quartiles 95 | for numeric_feature_name in numeric_feature_names[:num_categorical_features]: 96 | 97 | numeric_feature_names.remove(numeric_feature_name) 98 | 99 | categorical_name = numeric_feature_name.replace("numeric", "categorical") 100 | 101 | categorical_feature_names.append(categorical_name) 102 | 103 | df[categorical_name] = pd.qcut(df['numeric_feature_01'], 104 | q = [0, .25, .5, .75, 1], 105 | labels=False, 106 | precision = 0) 107 | 108 | df = df[categorical_feature_names + numeric_feature_names] 109 | 110 | # Convert a proportion of values to missing 111 | percent_missing_values = 0.05 112 | mask = np.random.choice([True, False], size=df.shape, p=[percent_missing_values, 1 - percent_missing_values]) 113 | df = df.mask(mask, other=np.nan) 114 | 115 | df['label'] = y 116 | df['group_name'] = group_data["group_name"].loc[0] 117 | 118 | col_ordering = ['group_name', 'label'] + categorical_feature_names + numeric_feature_names 119 | 120 | return df[col_ordering] 121 | 122 | # COMMAND ---------- 123 | 124 | # MAGIC %md Our synthetic dataset contains the group name column, label, distinct id of each row, a single categorical feature, and 20 numerical features. 125 | 126 | # COMMAND ---------- 127 | 128 | schema = StructType() 129 | schema.add('group_name', StringType()) 130 | schema.add('label', IntegerType()) 131 | schema.add('categorical_feature_01', FloatType()) 132 | 133 | num_categorical_features = 1 134 | total_features = 20 135 | 136 | for column_name in [f'numeric_feature_{str(n+1).zfill(2)}' for n in range(num_categorical_features, total_features)]: 137 | schema.add(column_name, FloatType()) 138 | 139 | features = (df.groupby('group_name').applyInPandas(create_group_data, schema=schema) 140 | .withColumn('id', func.monotonically_increasing_id())) 141 | 142 | features.write.mode('overwrite').format('delta').partitionBy('group_name').saveAsTable('default.synthetic_group_features') 143 | 144 | features = spark.table('default.synthetic_group_features') 145 | display(features) 146 | 147 | # COMMAND ---------- 148 | 149 | # MAGIC %md ## Feature encoding 150 | # MAGIC We will fit [XGBoost classification models](https://xgboost.readthedocs.io/en/stable/python/python_api.html#xgboost.XGBClassifier), which requires that our features are encoded. We will use scikit-learn's Pipeline and ColumnTransformer to apply different transformations based on column name. 151 | 152 | # COMMAND ---------- 153 | 154 | def create_preprocessing_transform(categorical_features: List[str], numerical_features: List[str]) -> ColumnTransformer: 155 | 156 | categorical_pipe = Pipeline( 157 | [ 158 | ("onehot", OneHotEncoder(handle_unknown="ignore")) 159 | ] 160 | ) 161 | 162 | numerical_pipe_quantile = Pipeline( 163 | [ 164 | ("imputer", SimpleImputer(strategy="median")) 165 | ] 166 | ) 167 | 168 | preprocessing = ColumnTransformer( 169 | [ 170 | ("categorical", categorical_pipe, categorical_features), 171 | ("numeric", numerical_pipe_quantile, numerical_features) 172 | ], 173 | remainder='drop' 174 | ) 175 | 176 | return preprocessing 177 | 178 | # COMMAND ---------- 179 | 180 | # MAGIC %md ## Fitting mutltiple XGBoost models using a PandasUDF 181 | # MAGIC For a first example, we will create a PandasUDF that will fit XGBoost models separately on the each groups's features. Then, we will incorporate more functionality related to hyper-parameter tuning and MLflow. 182 | 183 | # COMMAND ---------- 184 | 185 | # MAGIC %md ### Define the PandasUDF 186 | # MAGIC 187 | # MAGIC - The **configure_model** wrapper function allows us to make several parameters available to our PandasUDF, **train_model**, which receives a Pandas DataFrame and returns a Pandas DataFrame. 188 | # MAGIC 189 | # MAGIC - The data for each ship will be passed into a separate instance of our PandasUDF and each instance will be executed in parallel in different tasks on our cluster. We will capture information about where each model is trained to confirm this behavior. 190 | # MAGIC 191 | # MAGIC - Since XGBoost fits tree-based models sequentially, we will leverage the built-in early stopping functionality and continue to fit additional trees until the model's predictive capability stops improving. In our example, if a model's performance does not improve after building 25 consecutive trees, we will stop training. 192 | 193 | # COMMAND ---------- 194 | 195 | def configure_model_udf(label_col: str, grouping_col:str, pipeline:ColumnTransformer, test_size:float=0.33, 196 | xgb_early_stopping_rounds:int=25, eval_metric:str="auc", random_state:int=123) -> Callable[[pd.DataFrame], pd.DataFrame]: 197 | 198 | """ 199 | Configure a PandasUDF function and that trains and XGBoost model on a group of data. The UDF is applied 200 | using the groupBy.applyInPandas method. 201 | """ 202 | 203 | def train_model_udf(group_training_data): 204 | 205 | # Measure the training time of each model 206 | start = datetime.datetime.now() 207 | 208 | # Capture the name of the group to be modeled 209 | group_name = group_training_data[grouping_col].loc[0] 210 | 211 | x_train, x_test, y_train, y_test = train_test_split(group_training_data, 212 | group_training_data[label_col], 213 | test_size=test_size, 214 | random_state=random_state) 215 | 216 | # We must pass the testing dataset to the model to leverage early stopping, 217 | # and the training dataset must be transformed. 218 | x_train_transformed = pipeline.fit_transform(x_train) 219 | x_test_transformed = pipeline.transform(x_test) 220 | 221 | # Create a scikit-learning pipeline that transforms the features and applies the 222 | # model 223 | model = XGBClassifier(n_estimators=1000) 224 | 225 | # Fit the model with early stopping 226 | # Note: Early stopping returns the model from the last iteration (not the best one). If there’s more 227 | # than one item in eval_set, the last entry will be used for early stopping. 228 | model.fit(x_train_transformed, y_train.values.ravel(), 229 | eval_set = [(x_test_transformed, y_test.values.ravel())], 230 | eval_metric=eval_metric, 231 | early_stopping_rounds=xgb_early_stopping_rounds, 232 | verbose=True) 233 | 234 | # Capture statistics on the best model run 235 | best_score = model.best_score 236 | 237 | # The best performing number of XGBoost trees 238 | best_iteration = model.best_iteration 239 | best_xgboost_rounds = (0, best_iteration + 1) 240 | 241 | # Predict using only the boosters leading up to and including the best boosting 242 | # round. This accounts for the fact that the model retained by xgboost is the last 243 | # model fit before early stopping rounds were triggered 244 | precision_train, recall_train, f1_train, _ = precision_recall_fscore_support(y_train, 245 | model.predict(x_train_transformed, 246 | iteration_range=best_xgboost_rounds), 247 | average='weighted') 248 | 249 | precision_test, recall_test, f1_test, _ = precision_recall_fscore_support(y_test, 250 | model.predict(x_test_transformed, 251 | iteration_range=best_xgboost_rounds), 252 | average='weighted') 253 | 254 | train_auc = roc_auc_score(y_train, 255 | model.predict_proba(x_train_transformed, 256 | iteration_range=best_xgboost_rounds)[:,1], 257 | average="weighted") 258 | 259 | end = datetime.datetime.now() 260 | elapsed = end-start 261 | seconds = round(elapsed.total_seconds(), 1) 262 | 263 | # Capture data about our the model 264 | digits = 3 265 | metrics = OrderedDict() 266 | metrics["train_precision"]= round(precision_train, digits) 267 | metrics["train_recall"] = round(recall_train, digits) 268 | metrics["train_f1"] = round(f1_train, digits) 269 | metrics["train_auc"] = round(train_auc, digits) 270 | metrics["test_precision"] = round(precision_test, digits) 271 | metrics["test_recall"] = round(recall_test, digits) 272 | metrics["test_f1"] = round(f1_test, digits) 273 | metrics["test_auc"] = round(best_score, digits) 274 | metrics["best_iteration"] = round(best_iteration, digits) 275 | 276 | other_meta = OrderedDict() 277 | other_meta['group'] = group_name 278 | other_meta['stage_id'] = TaskContext().stageId() 279 | other_meta['task_attempt_id'] = task_attempt_id = TaskContext().taskAttemptId() 280 | other_meta['start_time'] = start.strftime("%d-%b-%Y (%H:%M:%S.%f)") 281 | other_meta['end_time'] = end.strftime("%d-%b-%Y (%H:%M:%S.%f)") 282 | other_meta['elapsed_seconds'] = seconds 283 | 284 | other_meta.update(metrics) 285 | 286 | return pd.DataFrame(other_meta, index=[0]) 287 | 288 | return train_model_udf 289 | 290 | # COMMAND ---------- 291 | 292 | # MAGIC %md ### Apply the PandasUDF 293 | # MAGIC The PandasUDF returns a Pandas DataFrame; we must specify a Spark DataFrame schema that maps to the column names and Python data types returned by the UDF. 294 | 295 | # COMMAND ---------- 296 | 297 | # Specify Spark DataFrame Schema 298 | spark_types = [('group', StringType()), 299 | ('stage_id', IntegerType()), 300 | ('task_attempt_id', IntegerType()), 301 | ('start_time', StringType()), 302 | ('end_time', StringType()), 303 | ('elapsed_seconds', FloatType()), 304 | ('train_precision', FloatType()), 305 | ('train_recall', FloatType()), 306 | ('train_f1', FloatType()), 307 | ('train_auc', FloatType()), 308 | ('test_precision', FloatType()), 309 | ('test_recall', FloatType()), 310 | ('test_f1', FloatType()), 311 | ('test_auc', FloatType()), 312 | ('best_iteration', IntegerType())] 313 | 314 | spark_schema = StructType() 315 | for col_name, spark_type in spark_types: 316 | spark_schema.add(col_name, spark_type) 317 | 318 | 319 | categorical_features = [col for col in features.columns if 'categorical' in col] 320 | numerical_features = [col for col in features.columns if 'numeric' in col] 321 | label_col = ['label'] 322 | grouping_col = 'group_name' 323 | 324 | # COMMAND ---------- 325 | 326 | # MAGIC %md ### View models' output 327 | # MAGIC We will now apply our PandasUDF to a Spark Dataframe, returning a new Spark Dataframe. Our returned DataFrame contains one row per group in our data. We can see that models were fit seprately for each group on different, independent Spark tasks and these tasks were all executed at the same time, and therefore, in the same stage. We also retrieved several performance statistics for each model. 328 | 329 | # COMMAND ---------- 330 | 331 | # Create a pre-processing pipeline instance 332 | pipeline = create_preprocessing_transform(categorical_features, numerical_features) 333 | 334 | # Configure the PandasUDF 335 | train_model_udf = configure_model_udf(label_col, 336 | grouping_col, 337 | pipeline) 338 | 339 | best_model_stats = features.groupBy('group_name').applyInPandas(train_model_udf, schema=spark_schema) 340 | 341 | best_model_stats.write.mode('overwrite').format('delta').saveAsTable('default.best_model_stats') 342 | display(spark.table('default.best_model_stats')) 343 | 344 | # COMMAND ---------- 345 | 346 | # MAGIC %md ## Tuning our models using Hyperopt 347 | # MAGIC Now that we can fit models in parallel on different groups of data, we shift toward model tuning. Compared to arbitrarily choosing different model parameters to test, the Hyperopt optimization library, which is built into the Databricks Machine Learning runtime, provides a more robust mechanism for intelligently searching a broader hyperparameter space, potentially leading to better models. 348 | # MAGIC 349 | # MAGIC We can incorporate a hyper-parameter search using Hyperopt into our Pandas UDF. Let's first fit a simple example on a single group for illustration purposes. 350 | 351 | # COMMAND ---------- 352 | 353 | # MAGIC %md ### Specify a search space 354 | # MAGIC We focus on four parameters to tune using Hyperopt that can help reduce overfitting by [adjusting different behaviors of our XGBoost models](https://xgboost.readthedocs.io/en/stable/parameter.html#parameters-for-tree-booster) during training. 355 | # MAGIC - **max_depth**: the maximum depth of each XGBoost tree. 356 | # MAGIC - **lambda**: a regularization parameter that reduces the model's sensitivity to the training data. 357 | # MAGIC - **subsample**: the percent of training data rows that will be sampled before fitting a single tree. 358 | # MAGIC - **colsample_bytree**: the percent of columns that will be sampled before fitting a single tree. 359 | 360 | # COMMAND ---------- 361 | 362 | parameter_search_space = {'n_estimators': 1000, 363 | 'max_depth': hp.quniform('max_depth', 3, 18, 1), 364 | 'lambda': hp.uniform('lambda', 1, 15), 365 | 'subsample': hp.uniform('subsample', 0.5, 1.0), 366 | 'colsample_bytree': hp.uniform('colsample_bytree', 0.5, 1.0), 367 | 'eval_metric': 'auc', 368 | 'use_label_encoder': False, 369 | 'random_state': 1} 370 | 371 | # COMMAND ---------- 372 | 373 | # MAGIC %md ### Specify a Hyperopt Objective Function 374 | # MAGIC We create a function that can receive combinations of hyper-parameter values from Hyperopt, fit an XGBoost model using those parameters, and return information to Hyperopt. This information, specifically the models 'loss', will be used by Hyperot to influence which hyper-parameter combinations should be tested next. Our loss will be calculated as 1 - the area under the curve (auc). Thus, we will find the model with the highest auc value, where 1 is the highest possible value. 375 | # MAGIC 376 | # MAGIC Will will again add a parent function, **configure_object_fn**, to pass additional information to our Hyperopt objective function. 377 | 378 | # COMMAND ---------- 379 | 380 | def configure_object_fn(x_train_transformed, y_train, x_test_transformed, y_test, xgb_early_stopping_rounds=25, 381 | eval_metric="auc") -> Callable[[dict], dict]: 382 | """ 383 | Configure a Hyperopt objective function 384 | """ 385 | 386 | def hyperopt_objective_fn(params): 387 | 388 | # Some model parameters require integeger values; change the type in these cases 389 | params['max_depth'] = int(params['max_depth']) 390 | 391 | model = XGBClassifier(**params) 392 | 393 | model.fit(x_train_transformed, y_train.values.ravel(), 394 | eval_set = [(x_test_transformed, y_test.values.ravel())], 395 | # See options here: https://github.com/dmlc/xgboost/blob/master/doc/parameter.rst 396 | eval_metric=eval_metric, 397 | early_stopping_rounds=xgb_early_stopping_rounds, 398 | verbose=True) 399 | 400 | best_score = model.best_score 401 | best_iteration = model.best_iteration 402 | best_xgboost_rounds = (0, best_iteration + 1) 403 | 404 | precision_train, recall_train, f1_train, _ = precision_recall_fscore_support(y_train, 405 | model.predict(x_train_transformed, 406 | iteration_range=best_xgboost_rounds), 407 | average='weighted') 408 | 409 | precision_test, recall_test, f1_test, _ = precision_recall_fscore_support(y_test, 410 | model.predict(x_test_transformed, 411 | iteration_range=best_xgboost_rounds), 412 | average='weighted') 413 | 414 | train_auc = roc_auc_score(y_train, 415 | model.predict_proba(x_train_transformed, 416 | iteration_range=best_xgboost_rounds)[:,1], 417 | average="weighted") 418 | 419 | digits = 3 420 | metrics = OrderedDict() 421 | metrics["train_precision"]= round(precision_train, digits) 422 | metrics["train_recall"] = round(recall_train, digits) 423 | metrics["train_f1"] = round(f1_train, digits) 424 | metrics["train_auc"] = round(train_auc, digits) 425 | metrics["test_precision"] = round(precision_test, digits) 426 | metrics["test_recall"] = round(recall_test, digits) 427 | metrics["test_f1"] = round(f1_test, digits) 428 | metrics["test_auc"] = round(best_score, digits) 429 | metrics["best_iteration"] = round(best_iteration, digits) 430 | 431 | return {'status': STATUS_OK, 'loss': 1- best_score, 'metrics': metrics} 432 | 433 | return hyperopt_objective_fn 434 | 435 | # COMMAND ---------- 436 | 437 | # MAGIC %md ### Encode input datasets and configure Hyperopt Objective Function 438 | 439 | # COMMAND ---------- 440 | 441 | features_df = features.filter(col('group_name') == 'group_01').toPandas() 442 | 443 | x_train, x_test, y_train, y_test = train_test_split(features_df, 444 | features_df['label'], 445 | test_size=0.33, 446 | random_state=123) 447 | 448 | pipeline = create_preprocessing_transform(categorical_features, numerical_features) 449 | 450 | x_train_transformed = pipeline.fit_transform(x_train) 451 | x_test_transformed = pipeline.transform(x_test) 452 | 453 | 454 | objective_fn = configure_object_fn(x_train_transformed, y_train, x_test_transformed, y_test) 455 | 456 | # COMMAND ---------- 457 | 458 | # MAGIC %md ### Launch the Hyperopt tuning workflow 459 | 460 | # COMMAND ---------- 461 | 462 | trials = Trials() 463 | 464 | best_params = fmin(fn=objective_fn, 465 | space=parameter_search_space, 466 | algo=tpe.suggest, 467 | max_evals=25, 468 | trials=trials, 469 | rstate=np.random.default_rng(50)) 470 | 471 | # COMMAND ---------- 472 | 473 | # MAGIC %md ### View Hyperopt findings 474 | # MAGIC Hyperopt provides the hyper-parameter combination of the best model as well as validation statistics generated using this model. We can use this information to fit a final model on our full datasets. 475 | 476 | # COMMAND ---------- 477 | 478 | print("Best model parameters \n") 479 | for param, value in best_params.items(): 480 | print(param, value) 481 | 482 | # COMMAND ---------- 483 | 484 | print("Best model statistics \n") 485 | for metric, value in trials.best_trial['result']['metrics'].items(): 486 | print(metric, value) 487 | 488 | # COMMAND ---------- 489 | 490 | print(f"Best Hyperopt trial: {trials.best_trial['tid']}") 491 | 492 | # COMMAND ---------- 493 | 494 | # MAGIC %md ### Fitting a final model 495 | # MAGIC We will construct a final model using the hyperprameter values returned by Hyperopt. There can be differences in the data types returned by Hyperopt and those required by libraries such as XGBoost. We will do some type conversion to account for this. 496 | # MAGIC 497 | # MAGIC The **n_estimators** parameter was found using XGBoosts early stopping functionality. Trees built beyond this number did not improve the model's predictive performance on the test dataset. 498 | # MAGIC 499 | # MAGIC In this example, we will combine the preprocessing pipeline and XGBoost model into a single Pipeline object so that feature transformation and model fitting/inference can occure via a single function call. 500 | 501 | # COMMAND ---------- 502 | 503 | # Collect the best model parameters 504 | final_model_parameters = {} 505 | final_model_parameters['n_estimators'] = trials.best_trial['result']['metrics']['best_iteration'] 506 | 507 | for parameter, value in best_params.items(): 508 | if parameter in ['max_depth']: 509 | final_model_parameters[parameter] = int(value) 510 | else: 511 | final_model_parameters[parameter] = value 512 | 513 | # Specify model 514 | model = XGBClassifier(**final_model_parameters) 515 | 516 | # Define pre-processing pipeline 517 | pipeline = create_preprocessing_transform(categorical_features, numerical_features) 518 | 519 | # Combine pre-processing pipeline and model 520 | model_pipeline = Pipeline([("preprocess", pipeline), ("classifier", model)]) 521 | 522 | # fit pre-processor and model 523 | model_pipeline.fit(features_df, features_df['label']) 524 | 525 | # COMMAND ---------- 526 | 527 | # MAGIC %md ### Performing inference 528 | # MAGIC Notice we are able to call our fitted model Pipeline on raw input data. Our Pipeline handles both feature transformations/encoding and prediction. 529 | 530 | # COMMAND ---------- 531 | 532 | # Probability that passenger survived 533 | predictions = pd.DataFrame(model_pipeline.predict_proba(x_test)[:, 1] , columns=['label_probability']) 534 | predictions = pd.concat([predictions, x_test.reset_index(drop=True)], axis=1) 535 | predictions.head() 536 | 537 | # COMMAND ---------- 538 | 539 | # MAGIC %md ### Incorporating Hyperopt into our PandasUDF 540 | # MAGIC With minor alterations, we can include Hyperopt into our UDF. 541 | 542 | # COMMAND ---------- 543 | 544 | def configure_model_hyperopt_udf(label_col:str, grouping_col:str, pipeline:ColumnTransformer, parameter_search_space, 545 | xgb_early_stopping_rounds:str=25, max_hyperopt_evals:int=25, eval_metric:str="auc", 546 | test_size:float=0.33, random_state:int=123) -> Callable[[pd.DataFrame], pd.DataFrame]: 547 | 548 | """ 549 | Configure a PandasUDF that train models using Hyperopt for hyperparameter tuning 550 | """ 551 | 552 | 553 | def train_model_hyperopt_udf(group_training_data): 554 | 555 | start = datetime.datetime.now() 556 | 557 | group_name = group_training_data[grouping_col].loc[0] 558 | 559 | x_train, x_test, y_train, y_test = train_test_split(group_training_data, 560 | group_training_data[label_col], 561 | test_size=test_size, 562 | random_state=random_state) 563 | 564 | # Transforming features outside of the iterative Hyperopt workflow 565 | x_train_transformed = pipeline.fit_transform(x_train) 566 | x_test_transformed = pipeline.transform(x_test) 567 | 568 | 569 | def hyperopt_objective_fn(params): 570 | 571 | # Some model parameters require integeger values; change the time in these cases 572 | params['max_depth'] = int(params['max_depth']) 573 | 574 | model = XGBClassifier(**params) 575 | 576 | model.fit(x_train_transformed, y_train.values.ravel(), 577 | eval_set = [(x_test_transformed, y_test.values.ravel())], 578 | # See options here: https://github.com/dmlc/xgboost/blob/master/doc/parameter.rst 579 | eval_metric=eval_metric, 580 | early_stopping_rounds=xgb_early_stopping_rounds, 581 | verbose=True) 582 | 583 | best_score = model.best_score 584 | best_iteration = model.best_iteration 585 | best_xgboost_rounds = (0, best_iteration + 1) 586 | 587 | precision_train, recall_train, f1_train, _ = precision_recall_fscore_support(y_train, 588 | model.predict(x_train_transformed, 589 | iteration_range=best_xgboost_rounds), 590 | average='weighted') 591 | 592 | precision_test, recall_test, f1_test, _ = precision_recall_fscore_support(y_test, 593 | model.predict(x_test_transformed, 594 | iteration_range=best_xgboost_rounds), 595 | average='weighted') 596 | train_auc = roc_auc_score(y_train, 597 | model.predict_proba(x_train_transformed, 598 | iteration_range=best_xgboost_rounds)[:,1], 599 | average="weighted") 600 | 601 | digits = 3 602 | metrics = OrderedDict() 603 | metrics["train_precision"]= round(precision_train, digits) 604 | metrics["train_recall"] = round(recall_train, digits) 605 | metrics["train_f1"] = round(f1_train, digits) 606 | metrics["train_auc"] = round(train_auc, digits) 607 | metrics["test_precision"] = round(precision_test, digits) 608 | metrics["test_recall"] = round(recall_test, digits) 609 | metrics["test_f1"] = round(f1_test, digits) 610 | metrics["test_auc"] = round(best_score, digits) 611 | metrics["best_iteration"] = round(best_iteration, digits) 612 | 613 | return {'status': STATUS_OK, 'loss': 1- best_score, 'metrics': metrics} 614 | 615 | 616 | trials = Trials() 617 | 618 | best_params = fmin(fn=hyperopt_objective_fn, 619 | space=parameter_search_space, 620 | algo=tpe.suggest, 621 | max_evals=max_hyperopt_evals, 622 | trials=trials, 623 | rstate=np.random.default_rng(50)) 624 | 625 | 626 | # Fit final model with best parameters on full dataset 627 | final_model_parameters = {} 628 | final_model_parameters['n_estimators'] = int(trials.best_trial['result']['metrics']['best_iteration']) 629 | 630 | # Adjust parameter data types to meet xgboost requirements 631 | for parameter, value in best_params.items(): 632 | if parameter in ['max_depth']: 633 | final_model_parameters[parameter] = int(value) 634 | else: 635 | final_model_parameters[parameter] = value 636 | 637 | # Fit the pipeline and model on full dataset 638 | final_model = XGBClassifier(**final_model_parameters) 639 | final_pipeline = Pipeline([("preprocess", pipeline), ("classifier", final_model)]) 640 | 641 | final_pipeline.fit(group_training_data, group_training_data[label_col]) 642 | 643 | end = datetime.datetime.now() 644 | elapsed = end-start 645 | seconds = round(elapsed.total_seconds(), 1) 646 | 647 | # Construct final output 648 | output = OrderedDict() 649 | output['group'] = group_name 650 | output['stage_id'] = TaskContext().stageId() 651 | output['task_attempt_id'] = task_attempt_id = TaskContext().taskAttemptId() 652 | output['start_time'] = start.strftime("%d-%b-%Y (%H:%M:%S.%f)") 653 | output['end_time'] = end.strftime("%d-%b-%Y (%H:%M:%S.%f)") 654 | output['elapsed_seconds'] = seconds 655 | 656 | output.update(trials.best_trial['result']['metrics']) 657 | 658 | return pd.DataFrame(output, index=[0]) 659 | 660 | return train_model_hyperopt_udf 661 | 662 | # COMMAND ---------- 663 | 664 | # Create a pre-processing pipeline instance 665 | pipeline = create_preprocessing_transform(categorical_features, numerical_features) 666 | 667 | # Configure the PandasUDF 668 | train_model_hyperopt_udf = configure_model_hyperopt_udf(label_col, 669 | grouping_col, 670 | pipeline, 671 | parameter_search_space) 672 | 673 | best_model_stats = features.groupBy('group_name').applyInPandas(train_model_hyperopt_udf, schema=spark_schema) 674 | 675 | best_model_stats.write.mode('overwrite').format('delta').saveAsTable('default.best_model_stats') 676 | 677 | display(spark.table('default.best_model_stats')) 678 | 679 | # COMMAND ---------- 680 | 681 | # MAGIC %md ## Tracking model runs and artifacts with MLflow 682 | # MAGIC We can leverage an MLflow Tracking Server to record information about our model runs, as well as artifacts, like fitted models. By leveraging more advanced MLflow capabilities we can also create a PandasUDF for model inference, which will allow us to score different groups of data with the model trained on each group. 683 | 684 | # COMMAND ---------- 685 | 686 | # MAGIC %md ### Create an MLflow Tracking Server instance 687 | # MAGIC You will see an entry in the MLflow Experiments UI for "pandas_udf_models". Our models will be logged to that location. 688 | 689 | # COMMAND ---------- 690 | 691 | def get_or_create_experiment(experiment_location: str) -> None: 692 | 693 | if not mlflow.get_experiment_by_name(experiment_location): 694 | print("Experiment does not exist. Creating experiment") 695 | 696 | mlflow.create_experiment(experiment_location) 697 | 698 | mlflow.set_experiment(experiment_location) 699 | 700 | 701 | experiment_location = '/Shared/pandas_udf_models' 702 | get_or_create_experiment(experiment_location) 703 | 704 | # COMMAND ---------- 705 | 706 | def get_new_run(experiment_location: str, run_name: str) -> str: 707 | """ 708 | Given an MLflow experiment location and a run name, create an 709 | MLflow experiment run to which artifacts can be logged. 710 | """ 711 | 712 | mlflow.set_experiment(experiment_location) 713 | run = mlflow.start_run(run_name=run_name) 714 | run_id = run.to_dictionary()['info']['run_id'] 715 | mlflow.end_run() 716 | 717 | return run_id 718 | 719 | # COMMAND ---------- 720 | 721 | # MAGIC %md ### Working with custom MLflow models 722 | # MAGIC Custom MLflow models provide a way to store special transformations as an MLflow model flavor that can be easily managed. You may want to alter the behavior of a modeling framework's built-in predict method or store a transformation that is not part of a supported ml framework. Both of these use cases are possible with Custom MLflow models. 723 | # MAGIC 724 | # MAGIC See a simple example below that receive and input and then multiplies that input by a number. We can store this custom model in MLflow, load it, and apply it to a Spark DataFrame. 725 | 726 | # COMMAND ---------- 727 | 728 | class CustomPythonModel(mlflow.pyfunc.PythonModel): 729 | def __init__(self, multiply_by): 730 | super().__init__() 731 | self.multiply_by = multiply_by 732 | 733 | def predict(self, context, model_input): 734 | prediction = model_input * self.multiply_by 735 | return prediction 736 | 737 | 738 | with mlflow.start_run() as run: 739 | 740 | run_id = run.info.run_id 741 | print(f"run_id: {run_id}") 742 | 743 | my_custom_model = CustomPythonModel(2) 744 | mlflow.pyfunc.log_model("my_model", python_model=my_custom_model) 745 | 746 | # COMMAND ---------- 747 | 748 | # MAGIC %md Apply the custom model to the features DataFrame 749 | 750 | # COMMAND ---------- 751 | 752 | # Load the model from MLflow 753 | logged_model = f'runs:/{run_id}/my_model' 754 | loaded_model = mlflow.pyfunc.spark_udf(spark, model_uri=logged_model, result_type='double') 755 | 756 | # Apply the model to a Spark Dataframe 757 | my_predictions = features.select('numeric_feature_02').withColumn('numeric_feature_02_scaled', loaded_model('numeric_feature_02')) 758 | 759 | display(my_predictions) 760 | 761 | # COMMAND ---------- 762 | 763 | # MAGIC %md ### Creating a custom MLflow models to load each group's model 764 | # MAGIC As we fit models on separate groups of data using our PandasUDF, we will log each fitted model to the same run in MLflow as an artifact. We will also create a "meta" model that given a group of data, will load the group's trained model from MLflow. 765 | 766 | # COMMAND ---------- 767 | 768 | class GroupInferenceModel(mlflow.pyfunc.PythonModel): 769 | """ 770 | A custom MLflow model designed to accept a group of data, 771 | import that group's trained model from MLflow, and return 772 | predictions for the group. 773 | 774 | Attributes: 775 | run_id: The MLflow run id to which all models will be logged 776 | group_name_col: The column name containing the grouping variable 777 | id_cols: The id columns that should be returned along with the 778 | predictions 779 | """ 780 | 781 | def __init__(self, run_id:str, group_name_col:str, id_cols:List[str]): 782 | super().__init__() 783 | self.run_id = run_id 784 | self.group_name_col = group_name_col 785 | self.id_cols = id_cols 786 | 787 | def predict(self, context, model_input: pd.DataFrame) -> pd.DataFrame: 788 | 789 | # Determine the group of the data 790 | group_name = model_input[self.group_name_col].loc[0] 791 | 792 | # Load the group's trained model from MLflow 793 | model_artifact_location = f'runs:/{self.run_id}/models/{group_name}' 794 | 795 | model = mlflow.sklearn.load_model(model_artifact_location) 796 | 797 | predictions = model.predict_proba(model_input) 798 | 799 | output_df = model_input[[self.group_name_col] + self.id_cols] 800 | output_df['probabilities'] = predictions.tolist() 801 | 802 | return output_df 803 | 804 | # COMMAND ---------- 805 | 806 | # MAGIC %md ### Adding the custom MLflow model to our PandasUDF and Logging to MLflow 807 | # MAGIC 808 | # MAGIC Model metrics and parameters are stored in csv files within each group's model artifact directory. We also include the best model parameters found by Hyperopt within our output Delta table as a Spark MapType. 809 | # MAGIC 810 | # MAGIC In addition, we leverage Hyperopt's early stopping functionality. Similar to early stopping for XGBoost, we can end our Hyperopt training runs if performance does not improve. Since we want to find our best models efficiently and are now working with larger datasets, we will instruct Hyperopt to stop testing hyper-parameters if our loss functions does not decrease after 25 trials. 811 | # MAGIC 812 | # MAGIC We set the **early_stop_fn** to **no_progress_loss**, which specifies the theshold beyond which the loss for a trial must improve. We will specify that the model loss must inprove by one half of a percentage point after 25 trials or else model training will stop. Details of the early stopping function [is available here](https://github.com/hyperopt/hyperopt/blob/master/hyperopt/early_stop.py), which is referenced in the [databricks documentation](https://docs.databricks.com/applications/machine-learning/automl-hyperparam-tuning/hyperopt-concepts.html#fmin). 813 | 814 | # COMMAND ---------- 815 | 816 | def configure_model_hyperopt_mlflow_udf(label_col:str, grouping_col:str, id_cols:List[str], pipeline:ColumnTransformer, parameter_search_space, 817 | experiment_location:str, run_id:str, xgb_early_stopping_rounds:str=25, max_hyperopt_evals:int=200, 818 | hyperopt_early_stopping_rounds:str=25, hyperopt_early_stopping_threshold:float=0.5, eval_metric:str="auc", 819 | test_size:float=0.33, random_state:int=123) -> Callable[[pd.DataFrame], pd.DataFrame]: 820 | 821 | 822 | # Log the meta model to the parent run 823 | with mlflow.start_run(run_id = run_id) as run: 824 | 825 | meta_model = GroupInferenceModel(run_id, grouping_col, id_cols) 826 | mlflow.pyfunc.log_model("meta_model", python_model=meta_model) 827 | 828 | 829 | def train_model_hyperopt_mlflow_udf(group_training_data: pd.DataFrame) -> pd.DataFrame: 830 | """ 831 | A PandasUDF that give a group of data will train an XGBoost model. Hyperparameter tuning 832 | is performed using Hyperopt. The best model parameters found by Hyperopt is used to train 833 | a final model. The model is logged to MLflow. Model fit statistics and parameters are logged 834 | as .csv files in the model artifact directory in MLflow 835 | """ 836 | 837 | start = datetime.datetime.now() 838 | 839 | group_name = group_training_data[grouping_col].loc[0] 840 | 841 | x_train, x_test, y_train, y_test = train_test_split(group_training_data, 842 | group_training_data[label_col], 843 | test_size=test_size, 844 | random_state=random_state) 845 | 846 | # Transforming features outside of the iterative Hyperopt workflow 847 | x_train_transformed = pipeline.fit_transform(x_train) 848 | x_test_transformed = pipeline.transform(x_test) 849 | 850 | 851 | def hyperopt_objective_fn(params): 852 | 853 | # Some model parameters require integeger values; change the type in these cases 854 | params['max_depth'] = int(params['max_depth']) 855 | 856 | model = XGBClassifier(**params) 857 | 858 | model.fit(x_train_transformed, y_train.values.ravel(), 859 | eval_set = [(x_test_transformed, y_test.values.ravel())], 860 | # See options here: https://github.com/dmlc/xgboost/blob/master/doc/parameter.rst 861 | eval_metric=eval_metric, 862 | early_stopping_rounds=xgb_early_stopping_rounds, 863 | verbose=True) 864 | 865 | best_score = model.best_score 866 | best_iteration = model.best_iteration 867 | best_xgboost_rounds = (0, best_iteration + 1) 868 | 869 | precision_train, recall_train, f1_train, _ = precision_recall_fscore_support(y_train, 870 | model.predict(x_train_transformed, iteration_range=best_xgboost_rounds), 871 | average='weighted') 872 | 873 | precision_test, recall_test, f1_test, _ = precision_recall_fscore_support(y_test, 874 | model.predict(x_test_transformed, iteration_range=best_xgboost_rounds), 875 | average='weighted') 876 | 877 | train_auc = roc_auc_score(y_train, 878 | model.predict_proba(x_train_transformed, iteration_range=best_xgboost_rounds)[:,1], 879 | average="weighted") 880 | 881 | # Capture and return fit statistcs from the Hyperopt Trial 882 | digits = 3 883 | metrics = OrderedDict() 884 | metrics["train_precision"]= round(precision_train, digits) 885 | metrics["train_recall"] = round(recall_train, digits) 886 | metrics["train_f1"] = round(f1_train, digits) 887 | metrics["train_auc"] = round(train_auc, digits) 888 | metrics["test_precision"] = round(precision_test, digits) 889 | metrics["test_recall"] = round(recall_test, digits) 890 | metrics["test_f1"] = round(f1_test, digits) 891 | metrics["test_auc"] = round(best_score, digits) 892 | metrics["best_iteration"] = round(best_iteration, digits) 893 | 894 | return {'status': STATUS_OK, 'loss': 1- best_score, 'metrics': metrics} 895 | 896 | 897 | trials = Trials() 898 | 899 | best_params = fmin(fn=hyperopt_objective_fn, 900 | space=parameter_search_space, 901 | algo=tpe.suggest, 902 | max_evals=max_hyperopt_evals, 903 | trials=trials, 904 | rstate=np.random.default_rng(50), 905 | early_stop_fn=no_progress_loss(iteration_stop_count=hyperopt_early_stopping_rounds, percent_increase=hyperopt_early_stopping_threshold)) 906 | 907 | final_model_parameters = {} 908 | final_model_parameters['n_estimators'] = int(trials.best_trial['result']['metrics']['best_iteration']) 909 | 910 | # Adjust parameter data types to meet xgboost requirements 911 | for parameter, value in best_params.items(): 912 | if parameter in ['max_depth']: 913 | final_model_parameters[parameter] = int(value) 914 | else: 915 | final_model_parameters[parameter] = value 916 | 917 | # Fit the pipeline and model on full dataset 918 | final_model = XGBClassifier(**final_model_parameters) 919 | final_pipeline = Pipeline([("preprocess", pipeline), ("classifier", final_model)]) 920 | 921 | final_pipeline.fit(group_training_data, group_training_data[label_col]) 922 | 923 | end = datetime.datetime.now() 924 | elapsed = end-start 925 | seconds = round(elapsed.total_seconds(), 1) 926 | 927 | mlflow.set_experiment(experiment_location) 928 | with mlflow.start_run(run_id = run_id) as run: 929 | 930 | # Log group model 931 | artifact_path =f'models/{group_name}' 932 | mlflow.sklearn.log_model(final_pipeline, artifact_path=artifact_path) 933 | 934 | # Log group parameters as csv 935 | parameters_column_names = ['parameter', 'value'] 936 | parameters_csv_formatted = [{"parameter": parameter, "value": value} 937 | for parameter, value in final_model_parameters.items()] 938 | 939 | parameters_file_name = '/parameters.csv' 940 | with open(parameters_file_name, 'w') as csvfile: 941 | writer = csv.DictWriter(csvfile, fieldnames=parameters_column_names) 942 | writer.writeheader() 943 | writer.writerows(parameters_csv_formatted) 944 | 945 | mlflow.log_artifact(parameters_file_name, artifact_path=artifact_path) 946 | 947 | # Log group metrics as csv 948 | best_model_metrics = trials.best_trial['result']['metrics'] 949 | 950 | metrics_column_names = ['metric', 'value'] 951 | metrics_csv_formatted = [{'metric': metric, "value": value} 952 | for metric, value in best_model_metrics.items()] 953 | 954 | metrics_file_name = f'/metrics.csv' 955 | with open(metrics_file_name, 'w') as csvfile: 956 | writer = csv.DictWriter(csvfile, fieldnames=metrics_column_names) 957 | writer.writeheader() 958 | writer.writerows(metrics_csv_formatted) 959 | 960 | mlflow.log_artifact(metrics_file_name, artifact_path=artifact_path) 961 | 962 | 963 | # Construct dataframe output 964 | output = OrderedDict() 965 | output['group'] = group_name 966 | output['mlflow_run_id'] = run_id 967 | output['stage_id'] = TaskContext().stageId() 968 | output['task_attempt_id'] = task_attempt_id = TaskContext().taskAttemptId() 969 | output['start_time'] = start.strftime("%d-%b-%Y (%H:%M:%S.%f)") 970 | output['end_time'] = end.strftime("%d-%b-%Y (%H:%M:%S.%f)") 971 | output['elapsed_seconds'] = seconds 972 | output['best_hyperopt_trial'] = trials.best_trial['tid'] 973 | output['best_params'] = [final_model_parameters] 974 | 975 | # Delete XGBoost best interation (number of trees) from the metrics dict 976 | del best_model_metrics['best_iteration'] 977 | output.update(best_model_metrics) 978 | 979 | return pd.DataFrame(output, index=[0]) 980 | 981 | return train_model_hyperopt_mlflow_udf 982 | 983 | # COMMAND ---------- 984 | 985 | # MAGIC %md ### Fitting the models and log to MLflow 986 | 987 | # COMMAND ---------- 988 | 989 | id_cols = ['id'] 990 | 991 | # Add additional columns to out Spark Schema 992 | spark_types = [('group', StringType()), 993 | ('mlflow_run_id', StringType()), 994 | ('stage_id', IntegerType()), 995 | ('task_attempt_id', IntegerType()), 996 | ('start_time', StringType()), 997 | ('end_time', StringType()), 998 | ('elapsed_seconds', FloatType()), 999 | ('train_precision', FloatType()), 1000 | ('train_recall', FloatType()), 1001 | ('train_f1', FloatType()), 1002 | ('train_auc', FloatType()), 1003 | ('test_precision', FloatType()), 1004 | ('test_recall', FloatType()), 1005 | ('test_f1', FloatType()), 1006 | ('test_auc', FloatType()), 1007 | ('best_hyperopt_trial', IntegerType()), 1008 | ('best_params', MapType(StringType(), FloatType()))] 1009 | 1010 | spark_schema = StructType() 1011 | for col_name, spark_type in spark_types: 1012 | spark_schema.add(col_name, spark_type) 1013 | 1014 | pipeline = create_preprocessing_transform(categorical_features, numerical_features) 1015 | 1016 | # Create MLflow parent run 1017 | run_id = get_new_run(experiment_location, "group_model_run") 1018 | 1019 | # Configure Pandas UDF 1020 | train_model_hyperopt_mlflow_udf = configure_model_hyperopt_mlflow_udf(label_col, 1021 | grouping_col, 1022 | id_cols, 1023 | pipeline, 1024 | parameter_search_space, 1025 | experiment_location, 1026 | run_id) 1027 | 1028 | # Fit models by applying UDF 1029 | best_model_stats = features.groupBy('group_name').applyInPandas(train_model_hyperopt_mlflow_udf, schema=spark_schema) 1030 | 1031 | best_model_stats.write.mode('overwrite').format('delta').saveAsTable('default.best_model_stats') 1032 | display(spark.table('default.best_model_stats')) 1033 | 1034 | # COMMAND ---------- 1035 | 1036 | # MAGIC %md ### Promoting the run to the Model Registry 1037 | 1038 | # COMMAND ---------- 1039 | 1040 | # MAGIC %md Create an [MLflow Client](https://www.mlflow.org/docs/latest/python_api/mlflow.tracking.html) instance 1041 | 1042 | # COMMAND ---------- 1043 | 1044 | client = MlflowClient() 1045 | 1046 | # COMMAND ---------- 1047 | 1048 | # MAGIC %md Create a [Model Registry](https://docs.databricks.com/applications/machine-learning/manage-model-lifecycle/index.html) entry if one does not exist 1049 | 1050 | # COMMAND ---------- 1051 | 1052 | model_registry_name = 'pandas_udf_models' 1053 | try: 1054 | client.get_registered_model(model_registry_name) 1055 | print(" Registered model already exists") 1056 | except: 1057 | client.create_registered_model(model_registry_name) 1058 | 1059 | # COMMAND ---------- 1060 | 1061 | # MAGIC %md Create an entry for the model in the registry 1062 | 1063 | # COMMAND ---------- 1064 | 1065 | model_info = client.get_run(run_id).to_dictionary() 1066 | artifact_uri = model_info['info']['artifact_uri'] 1067 | 1068 | 1069 | registered_model = client.create_model_version( 1070 | name = model_registry_name, 1071 | source = artifact_uri + "/meta_model", 1072 | run_id = run_id 1073 | ) 1074 | 1075 | # COMMAND ---------- 1076 | 1077 | # MAGIC %md Move the registered model to the "Production" stage 1078 | 1079 | # COMMAND ---------- 1080 | 1081 | promote_to_prod = client.transition_model_version_stage(name=model_registry_name, 1082 | version = int(registered_model.version), 1083 | stage="Production", 1084 | archive_existing_versions=True) 1085 | 1086 | # COMMAND ---------- 1087 | 1088 | # MAGIC %md ### Create an inference UDF 1089 | # MAGIC Our PandasUDF for inference is reletively simple. We will configure the PandasUDF to load our Production meta model from the Model Registry. Our helper function will capture the model's unique identifier, which we will use to load the model into the notebook. 1090 | # MAGIC 1091 | # MAGIC Similar to our model training UDF, our inference UDF will extract the group name of the data it receives. Then, the UDF will load the approriate group-level model and score the group's data. 1092 | # MAGIC 1093 | # MAGIC Our UDF will return the group name, unique id of the record, and the classification probabilities. 1094 | 1095 | # COMMAND ---------- 1096 | 1097 | def get_model_info(model_name:str, stage:str) -> str: 1098 | """ 1099 | Given a the name of a registered model and a Model Registry stage, 1100 | return the model's unique run id 1101 | """ 1102 | 1103 | from mlflow.tracking import MlflowClient 1104 | 1105 | client = MlflowClient() 1106 | 1107 | run_info = [run for run in client.search_model_versions(f"name='{model_name}'") 1108 | if run.current_stage == stage][0] 1109 | 1110 | return run_info.source 1111 | 1112 | # COMMAND ---------- 1113 | 1114 | # MAGIC %md Define the PandasUDF for applying the meta_model to groups of data 1115 | 1116 | # COMMAND ---------- 1117 | 1118 | def inference_model_config(model_name:str, registry_stage:str='Production') -> Callable[[pd.DataFrame], pd.DataFrame]: 1119 | """ 1120 | Load a model from the Model Registry and return a PandasUDF function. The PandasUDF will apply 1121 | the models to different groups of data via the groupBy.applyInPandas method. 1122 | """ 1123 | 1124 | model_artifact_location = get_model_info(model_name, registry_stage) 1125 | 1126 | model = mlflow.pyfunc.load_model(model_artifact_location) 1127 | 1128 | def apply_models(model_input): 1129 | 1130 | predictions = model.predict(model_input) 1131 | 1132 | return predictions 1133 | 1134 | return apply_models 1135 | 1136 | # COMMAND ---------- 1137 | 1138 | # MAGIC %md Specify the Spark Dataframe schema that maps to the UDF's output. Apply the inference UDF to generate predictions. 1139 | 1140 | # COMMAND ---------- 1141 | 1142 | prediction_schema = StructType() 1143 | prediction_schema.add('group_name', StringType()) 1144 | prediction_schema.add('id', IntegerType()) 1145 | prediction_schema.add('probabilities', ArrayType(FloatType())) 1146 | 1147 | inference_model = inference_model_config(model_name = model_registry_name) 1148 | 1149 | predictions = features.groupBy('group_name').applyInPandas(inference_model, schema=prediction_schema) 1150 | 1151 | display(predictions) 1152 | --------------------------------------------------------------------------------